BART Large CNN fine-tuning
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, BartTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
import os

# Allow MPS to use all available memory (relevant when training on Apple Silicon)
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
# Define your dataset class
class CustomDataset(Dataset):
    def __init__(self, csv_path, tokenizer):
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        task = self.data.iloc[idx]['task']
        summary = self.data.iloc[idx]['summary']

        # Tokenize the input and target sequences
        tokenized_inputs = self.tokenizer.encode_plus(task, padding='max_length', truncation=True, max_length=512)
        tokenized_summary = self.tokenizer.encode_plus(summary, padding='max_length', truncation=True, max_length=512)

        # Convert the tokenized sequences to tensors
        input_ids = torch.tensor(tokenized_inputs['input_ids'])
        attention_mask = torch.tensor(tokenized_inputs['attention_mask'])
        summary_ids = torch.tensor(tokenized_summary['input_ids'])

        # Mask pad positions in the labels with -100 so the loss ignores them
        summary_ids[summary_ids == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'summary_ids': summary_ids
        }
# Set the path to your CSV file
csv_path = 'yuna copy.csv'

# Set up the tokenizer
tokenizer = BartTokenizer.from_pretrained('./bart-large-cnn/')

# Create an instance of the custom dataset
dataset = CustomDataset(csv_path, tokenizer)
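
# Optional sanity check (illustrative sketch, assuming the CSV has 'task' and
# 'summary' columns): inspect one tokenized example before training
sample = dataset[0]
print({key: value.shape for key, value in sample.items()})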
# Define hyperparameters and training configurations
batch_size = 1
num_epochs = 50
learning_rate = 1e-5

# Create a data loader for the dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Set dropout at load time; changing config.dropout after the layers are
# instantiated has no effect on the already-built modules
dropout = 0.01

# Load the pretrained BART model
model = BartForConditionalGeneration.from_pretrained('./bart-large-cnn/', dropout=dropout)

# Prefer CUDA, then Apple MPS (the watermark setting above targets MPS), else CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)
model.to(device)

# Set the model to training mode
model.train()
# Define the optimizer and the learning rate scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(dataloader) * num_epochs
# Linear decay over all training steps with no warmup (a common default)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Set the directory to save the models
save_dir = 'Yuna-trained'
# Set the generation parameters for summarization
length_penalty = 1.0
max_length = 1024
min_length = 120

model.config.length_penalty = length_penalty
model.config.max_length = max_length
model.config.min_length = min_length
# Set the task-specific parameters for summarization
task_specific_params = {
    'summarization': {
        'early_stopping': True,
        'length_penalty': length_penalty,
        'max_length': max_length,
        'min_length': min_length,
        'no_repeat_ngram_size': 3,
        'num_beams': 4
    }
}

# Update the model configuration with task-specific parameters
if model.config.task_specific_params is None:
    model.config.task_specific_params = {}
for task, params in task_specific_params.items():
    if task in model.config.task_specific_params:
        model.config.task_specific_params[task].update(params)
    else:
        model.config.task_specific_params[task] = params
# Training loop
for epoch in range(num_epochs):
    total_loss = 0

    for batch in dataloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass; pad positions in the labels are already masked to -100
        outputs = model(input_ids=batch['input_ids'].to(device),
                        attention_mask=batch['attention_mask'].to(device),
                        labels=batch['summary_ids'].to(device))

        # Compute the loss
        loss = outputs.loss

        # Backpropagation
        loss.backward()

        # Update the model parameters and the learning rate
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Print the average loss for the epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader):.4f}")

    # Save the model every 5th epoch
    if (epoch + 1) % 5 == 0:
        checkpoint_dir = os.path.join(save_dir, f'fine_tuned_model_epoch{epoch+1}')
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)

# Save the final fine-tuned model
model.save_pretrained(os.path.join(save_dir, 'fine_tuned_model_final'))
tokenizer.save_pretrained(os.path.join(save_dir, 'fine_tuned_model_final'))
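
# Minimal inference sketch, assuming the final checkpoint saved above:
# reload the fine-tuned model and summarize a single input.
# 'sample_text' is a hypothetical placeholder; replace it with real task text.
trained_model = BartForConditionalGeneration.from_pretrained(os.path.join(save_dir, 'fine_tuned_model_final')).to(device)
trained_model.eval()

sample_text = "Replace this with an input row from the CSV."  # hypothetical example input
inputs = tokenizer(sample_text, return_tensors='pt', truncation=True, max_length=512).to(device)
with torch.no_grad():
    generated_ids = trained_model.generate(
        **inputs,
        num_beams=4,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        no_repeat_ngram_size=3,
        early_stopping=True
    )
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))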