bad eval output
from itertools import chain
from typing import Optional

import numpy as np

import datasets
import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)


def main():
    # doesn't fail if dataloader_num_workers is 0, not using IterableDataset, or not using more than one gpu
    training_args: TrainingArguments = TrainingArguments(output_dir="scr", dataloader_num_workers=1)
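    # Because of the multi-GPU condition above, reproducing the failure needs a distributed launch,
    # e.g. something like `torchrun --nproc_per_node=2 repro.py` (filename and GPU count here are
    # purely illustrative).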
    raw_dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")

    # model doesn't matter, just something smallish
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    model = AutoModelForCausalLM.from_pretrained("distilgpt2")

    column_names = raw_dataset.column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        output = tokenizer(examples[text_column_name])
        return output

    with training_args.main_process_first(desc="dataset map tokenization"):
        tokenized_dataset = raw_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=1,
            remove_columns=column_names,
            desc="Running tokenizer on dataset",
        )

    block_size = tokenizer.model_max_length
    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could add padding instead of dropping if the model supported it.
        # You can customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
    with training_args.main_process_first(desc="grouping texts together"):
        lm_dataset = tokenized_dataset.map(
            group_texts,
            batched=True,
            num_proc=1,
            desc=f"Grouping texts in chunks of {block_size}",
        )
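    # Wrapping the map-style dataset as an IterableDataset forces the Trainer down its
    # iterable-dataset evaluation path, which (per the comment at the top of main) is one
    # of the ingredients needed to trigger the bad eval output.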
    class WrapperDataset(torch.utils.data.IterableDataset):
        def __init__(self, ds):
            self.ds = ds

        def __iter__(self):
            return iter(self.ds)

    eval_dataset = WrapperDataset(lm_dataset)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=None,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
    )

    metrics = trainer.evaluate()
    assert np.isfinite(metrics["eval_loss"])


def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()


if __name__ == "__main__":
    main()