@MarioZZJ
Last active May 25, 2024 02:12
Embedding paper titles and abstracts with biobert-large-cased
from transformers import AutoConfig, AutoModel, AutoTokenizer
import pandas as pd
import torch
import csv
import argparse
from tqdm import tqdm
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--year', type=int, default=1991)
    parser.add_argument('--cache_dir', type=str, default='/Public/huggingface')
    parser.add_argument('--model_name', type=str, default='dmis-lab/biobert-large-cased-v1.1')
    parser.add_argument('--proxy_ip', type=str, default='127.0.0.1')
    parser.add_argument('--proxy_port', type=int, default=10809)
    parser.add_argument('--batch_size', type=int, default=16)
    args = parser.parse_args()
    year = args.year
    proxy_ip = args.proxy_ip
    proxy_port = args.proxy_port
    model_name = args.model_name
    cache_dir = args.cache_dir
    proxies = {
        'http': f'http://{proxy_ip}:{proxy_port}',
        'https': f'http://{proxy_ip}:{proxy_port}'
    }
    batch_size = args.batch_size
    # load huggingface model from the local cache
    config = AutoConfig.from_pretrained(model_name, proxies=proxies, cache_dir=cache_dir, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, proxies=proxies, cache_dir=cache_dir, local_files_only=True)
    model = AutoModel.from_pretrained(model_name, proxies=proxies, config=config, cache_dir=cache_dir, local_files_only=True)
    model = model.to('cuda')

    # load and parse data
    data = pd.read_csv(f'data/IEK4D/pmid_title_abstract/{year}.tsv', sep='\t', quoting=csv.QUOTE_ALL, quotechar='"', escapechar='\\')
    data = data.dropna(axis=0, subset=['pmid', 'publish_year'])
    data = data.fillna(' ')
    data['publish_year'] = data['publish_year'].astype(int)
    def embed_batch(texts):
        """Tokenize a batch of texts and mean-pool the last hidden states over non-padding tokens."""
        # pad/truncate every document to a fixed 512 tokens
        batch_dict = tokenizer(texts, return_tensors='pt', padding='max_length', truncation=True, max_length=512).to('cuda')
        with torch.no_grad():
            outputs = model(**batch_dict)
        last_hidden_states = outputs.last_hidden_state
        attention_mask = batch_dict['attention_mask']
        # zero out padding positions so they do not contribute to the mean
        last_hidden = last_hidden_states.masked_fill(~attention_mask.unsqueeze(-1).bool(), 0)
        batch_embedding = last_hidden.sum(dim=1) / attention_mask.sum(dim=1).unsqueeze(-1)
        return batch_embedding.detach().cpu()

    # embed documents batch by batch
    pmids = []
    embs = torch.tensor([])
    batch_text = []
    batch_pmids = []
    for _, row in tqdm(data.iterrows(), total=len(data), desc=f'Embedding {year}'):
        pmid = row['pmid']
        title = row['title']
        abstract = row['abstract']
        text = title + ' ' + abstract
        batch_text.append(text)
        batch_pmids.append(pmid)
        if len(batch_pmids) < batch_size:
            continue
        pmids.extend(batch_pmids)
        embs = torch.cat((embs, embed_batch(batch_text)), dim=0)
        batch_text = []
        batch_pmids = []
    # embed any leftover documents that did not fill a full batch
    if batch_pmids:
        pmids.extend(batch_pmids)
        embs = torch.cat((embs, embed_batch(batch_text)), dim=0)
    # save embeddings (embs is already a tensor, so no numpy round-trip is needed)
    torch.save(embs, f'data/IEK4D/embedding/text/document/{year}_emb.pt')
    torch.save(pmids, f'data/IEK4D/embedding/text/document/{year}_pmids.pt')
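
For downstream use, the saved tensors can be loaded back as follows (a minimal sketch; the paths mirror the save calls above, and year is assumed to match the run that produced the files):

import torch

year = 1991  # assumption: the same --year used when embedding
embs = torch.load(f'data/IEK4D/embedding/text/document/{year}_emb.pt')    # shape (num_docs, hidden_size)
pmids = torch.load(f'data/IEK4D/embedding/text/document/{year}_pmids.pt') # list of pmids aligned with the rows of embs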
MarioZZJ commented Jan 11, 2024

Optimized.

@MarioZZJ

It seems more common to use the [CLS] token to represent the whole document for downstream classification, as in SPECTER?
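
For comparison, here is a minimal sketch of [CLS]-token pooling in the SPECTER style (this assumes the allenai/specter checkpoint and a hypothetical sample batch; per the SPECTER model card, title and abstract are concatenated with the separator token, and the first token's final hidden state serves as the document embedding):

from transformers import AutoModel, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter').to('cuda')

# hypothetical sample batch of (title, abstract) pairs
batch = [('BioBERT: a pre-trained biomedical language representation model', 'We introduce BioBERT ...')]
texts = [title + tokenizer.sep_token + abstract for title, abstract in batch]
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to('cuda')
with torch.no_grad():
    outputs = model(**inputs)
# [CLS] pooling: the hidden state of the first token represents the whole document
cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu()  # shape (batch, hidden_size)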
