
@victor-iyi
Created August 19, 2024 06:30
Evaluate the output of Large Language Models based on how helpful the results are, from "Not helpful" to "Highly helpful", on a Likert scale.
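"""Evaluate LLM outputs for helpfulness using ChatGPT as the annotator.

Each (query, model output) pair is scored on a 1-6 Likert scale, from
"Not helpful" (1) to "Highly helpful" (6), and the per-model results are
written to a JSONL file.
"""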
import logging
import os
import pathlib
import re
import time
from typing import Any
import jsonlines
import openai
import torch
from rlhf_trl.args import PPOArgs
from rlhf_trl.evaluate.utils import load_models
from rlhf_trl.evaluate.utils import load_pipelines
from rlhf_trl.evaluate.utils import reduce_context_length
from rlhf_trl.predict import generate_with_model
from rlhf_trl.predict import generate_with_pipeline
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer
CHATGPT_ANNOTATOR_PROMPT = """You are evaluating a response that has been submitted for a particular task, using a specific set of standards. Below is the data:
[BEGIN DATA]
***
[Task]: {}
***
[Submission]: {}
***
[Criterion]: helpfulness:
"1": "Not helpful - The generated text is completely irrelevant, unclear, or incomplete. It does not provide any useful information to the user."
"2": "Somewhat helpful - The generated text has some relevance to the user’s question, but it may be unclear or incomplete. It provides only
partial information, or the information provided may not be useful for the user’s needs."
"3": "Moderately helpful - The generated text is relevant to the user’s question, and it provides a clear and complete answer. However, it may
lack detail or explanation that would be helpful for the user."
"4": "Helpful - The generated text is quite relevant to the user’s question, and it provides a clear, complete, and detailed answer. It offers
additional information or explanations that are useful for the user. However, some of the points of the response are somewhat repetitive or could
be combined for greater clarity and concision."
"5": "Very helpful - The generated text is highly relevant to the user’s question, and it provides a clear, complete, and detailed answer. It offers
additional information, explanations, or analogies that are not only useful but also insightful and valuable to the user. However, the structure
of the response is not well-organized, and there is no clear progression or logical sequence of different points in the response."
"6": "Highly helpful - The generated text provides a clear, complete, and detailed answer. It offers additional information or explanations that
are not only useful but also insightful and valuable to the user. The response is also organized in a logical and easy-to-follow manner, explicitly using
headings, bullet points, or numbered lists to break up the information and make it easier to read."
***
[END DATA]
Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your
conclusion is correct. Avoid simply stating the correct answers at the outset. Then print the choice only from “1, 2, 3, 4, 5, 6” (without quotes
or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the selected choice again by itself on a new line."""
# Regex pattern to match a single digit.
SINGLE_DIGIT_PATTERN = re.compile(r'\b\d\b')
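# Illustrative example (not part of the original gist): the annotator prompt
# asks ChatGPT to end its reply with the chosen score alone on the final line,
# so a typical response looks like 'Step 1: ...\nStep 2: ...\n4\n4'. Parsing
# then reduces to taking the last line and matching a single digit:
#
#     last_line = 'Step 1: ...\nStep 2: ...\n4\n4'.strip('\n').split('\n')[-1]
#     SINGLE_DIGIT_PATTERN.search(last_line).group()  # -> '4'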
def api_request(
prompt: str,
openai_model: str = 'gpt-3.5-turbo',
max_tokens: int = 512,
) -> str:
"""Send a request to the OpenAI API and return the response.
Args:
prompt (str): The prompt to send to the API.
openai_model (str, optional): The OpenAI model to use.
Defaults to 'gpt-3.5-turbo'.
max_tokens (int, optional): The maximum number of tokens to generate.
Defaults to 512.
Returns:
str: The response from the API.
"""
# Set the OpenAI API key.
openai.api_key = os.getenv('OPENAI_API_KEY')
# Send the request.
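    # NOTE: `openai.ChatCompletion` is the legacy (pre-1.0) OpenAI Python API;
    # with openai>=1.0 the equivalent call is `client.chat.completions.create(...)`.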
output = openai.ChatCompletion.create(
model=openai_model,
messages=[{'role': 'user', 'content': prompt}],
temperature=0.7,
max_tokens=max_tokens,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
)
return output['choices'][0]['message']['content']
def chatgpt_output(
prompt: str,
openai_model: str = 'gpt-3.5-turbo',
max_tokens: int = 512,
retry_secs: int = 5,
) -> tuple[bool, str]:
"""Send a request to the OpenAI API and return the response.
Args:
prompt (str): The prompt to send to the API.
openai_model (str, optional): The OpenAI model to use.
Defaults to 'gpt-3.5-turbo'.
max_tokens (int, optional): The maximum number of tokens to generate.
Defaults to 512.
retry_secs (int, optional): The number of seconds to wait before retrying.
Defaults to 5.
Returns:
tuple[bool, str]: A tuple containing a boolean indicating whether
the request was successful and the response.
"""
success = False
while True:
try:
output = api_request(
prompt=prompt,
openai_model=openai_model,
max_tokens=max_tokens,
)
success = True
break
except openai.InvalidRequestError as e:
logging.error(f'Failed to get response: {e}')
logging.error(f'Error: {e.error["code"]}')
if e.error['code'] == 'content_length_exceeded':
output = e.error['code']
break
else:
# Retry in `retry_secs` seconds.
time.sleep(retry_secs)
continue
return success, output
def evaluate_with_chatgpt( # noqa: C901
args: PPOArgs,
model_map: dict[str, str],
tokenizer: AutoTokenizer,
loader: DataLoader[Any],
use_pipeline: bool = False,
device: torch.device | str = 'cpu',
**gen_kwargs: Any,
) -> None:
"""Evaluate the model with ChatGPT evaluation score.
Args:
args (PPOArgs): The script arguments.
model_map (dict[str, str]): The model name to model mapping.
tokenizer (AutoTokenizer): The tokenizer to use.
loader (DataLoader): The data loader to use.
use_pipeline (bool, optional): Whether to use the `pipeline` API.
Defaults to False.
device (torch.device | str): The device to use.
Defaults to 'cpu'.
gen_kwargs: Keyword arguments for the generation function.
"""
if use_pipeline:
logging.info('Using pipeline for evaluation.')
models = load_pipelines(model_map, tokenizer=tokenizer)
else:
logging.info('Using model.generate() for evaluation.')
models = load_models(model_map)
# Get the eval_name and save_path.
if args.eval_name is None:
# Assume path is of format: experiments/<project_name>/<run_name>/model
# and use <run_name> as the eval_name.
try:
eval_name = pathlib.Path(args.ppo_model_name).parent.name
except Exception as e:
logging.exception(f'{e}: Unable to get eval_name from {args.ppo_model_name}')
logging.warning('Using default eval_name: reward-eval')
eval_name = 'reward-eval'
else:
eval_name = args.eval_name
# Create the save_path.
os.makedirs(args.eval_save_path, exist_ok=True)
save_path = os.path.join(args.eval_save_path, f'{eval_name}.jsonl')
if os.path.isfile(save_path):
logging.warning(f'{save_path} already exists. Overwriting...')
# Write the data to the jsonl file.
with jsonlines.open(save_path, 'w') as writer:
        for batch in tqdm(loader, desc='Evaluating'):
_data: dict[str, Any] = {} # [query, answer, answer_score, model_name_output, model_name_score]
_mapping: dict[str, list[tuple[str, str, int]]] = {} # model_name -> [(query, model_output, model_score)]
for model_name, model in models.items():
# Model output.
if use_pipeline:
model_outputs = generate_with_pipeline(
pipeline=model,
texts=batch['query'],
**gen_kwargs,
)
else:
model_outputs = generate_with_model(
model=model,
tokenizer=tokenizer,
input_ids=torch.stack(batch['input_ids'], dim=0).to(device),
**gen_kwargs,
)
# Get ChatGPT helpfulness scores.
results = get_chatgpt_scores(
queries=batch['query'],
outputs=model_outputs,
max_tokens=args.max_token,
)
_mapping[model_name] = results
            # Write one record per sample: the query plus each model's output and score.
            # NOTE: `get_chatgpt_scores` drops samples whose API request failed, so the
            # remaining results are assumed to stay aligned across models.
            num_samples = min((len(r) for r in _mapping.values()), default=0)
            for i in range(num_samples):
                _data = {}
                for name, results in _mapping.items():
                    query, output, score = results[i]
                    _data['query'] = query
                    _data[f'{name}_output'] = output
                    _data[f'{name}_score'] = score
                logging.info(_data)
                # Write the data to the jsonl file.
                writer.write(_data)
# for query, output in zip(batch, model_outputs):
# try:
# score = _get_score_from_chatgpt(query, output)
# _mapping[model_name] = (output, score)
# except Exception as e:
# logging.exception(f'Error: {e}')
# # Prompt engineering to get the score w/ query & output.
# prompt = CHATGPT_ANNOTATOR_PROMPT.format(query, output)
# success, response = chatgpt_output(prompt, max_tokens=args.max_token)
# if not success:
# if response == 'content_length_exceeded':
# # Reduce the context length of the query and try again.
# query = reduce_context_length(query)
# prompt = CHATGPT_ANNOTATOR_PROMPT.format(query, output)
# success, response = chatgpt_output(prompt, max_tokens=args.max_token)
# else:
# logging.error(f'Failed to get response: {response}')
# continue
# if success:
# try:
# score = int(response.strip('\n').split('\n')[-1])
# eval_scores[model_name].append({
# 'query': query,
# 'output': output,
# 'score': score,
# })
# except Exception as e:
# logging.warning(f'Failed to parse score: {e}')
# _score = response.strip('\n')
# logging.warning(f'Score needs to be ready manually: {_score}')
# # Save the results.
# if args.eval_name is None:
# try:
# # Assume path is of format: experiments/<project_name>/<run_name>/model
# # and use <run_name> as the eval_name.
# eval_name = pathlib.Path(args.ppo_model_name).parent.name
# except Exception as e:
# logging.exception(f'{e}: Unable to get eval_name from {args.ppo_model_name}')
# logging.warning('Using default eval_name: chatgpt-eval')
# eval_name = 'chatgpt-eval'
# else:
# eval_name = args.eval_name
# save_path = os.path.join(args.eval_save_path, f'{eval_name}.json')
# os.makedirs(args.eval_save_path, exist_ok=True)
# logging.info(f'Saving evaluation results to {save_path}...')
# with open(save_path, 'w') as f:
# json.dump(dict(eval_scores), f)
def get_chatgpt_scores(
queries: list[str],
outputs: list[str],
max_tokens: int = 512,
) -> list[tuple[str, str, int]]:
"""Get ChatGPT score from query, output pair.
Args:
queries (list[str]): The queries.
outputs (list[str]): The list of outputs.
max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 512.
Returns:
list[tuple[str, str, int]]: Query, output & ChatGPT scores.
"""
results = []
for query, output in zip(queries, outputs):
try:
score = _get_score_from_chatgpt(query, output, max_tokens=max_tokens)
results.append((query, output, score))
except Exception as e:
logging.exception(f'Error: {e}')
return results
def _get_score_from_chatgpt(
query: str,
output: str,
max_tokens: int = 512,
) -> int:
"""Get the score from ChatGPT.
Args:
query (str): The query.
output (str): The output.
max_tokens (int): Maximum number of tokens to generate.
Defaults to 512.
Raises:
ValueError: Failed to get response.
Failed to parse score.
Returns:
int: The score.
"""
prompt = CHATGPT_ANNOTATOR_PROMPT.format(query, output)
success, response = chatgpt_output(prompt, max_tokens=max_tokens)
if not success:
# Try again with a reduced context length.
if response == 'content_length_exceeded':
# Reduce the context length of the query and try again.
query = reduce_context_length(query)
prompt = CHATGPT_ANNOTATOR_PROMPT.format(query, output)
            success, response = chatgpt_output(prompt, max_tokens=max_tokens)
else:
# Continue to the next query.
raise ValueError(f'Failed to get response: {response}')
    if not success:
        # Give up if the retry also failed.
        raise ValueError(f'Failed to get response: {response}')
    # Parse the score: the prompt asks for the chosen digit alone on the last line.
    match = SINGLE_DIGIT_PATTERN.search(response.strip('\n').split('\n')[-1])
    if match:
        return int(match.group())
    else:
        _score = response.strip('\n')
        logging.warning(f'\nScore needs to be read manually: {_score}')
        raise ValueError('Failed to parse score.')
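# -----------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original script). It
# assumes `PPOArgs` exposes `ppo_model_name`, `eval_name`, `eval_save_path` and
# `max_token` (as used above), that the DataLoader yields batches with a
# 'query' list and 'input_ids' tensors, and that the generation helpers accept
# `max_new_tokens`. `eval_dataset` and the model paths are placeholders.
# -----------------------------------------------------------------------------
# args = PPOArgs()  # hypothetical construction; set the fields listed above
# tokenizer = AutoTokenizer.from_pretrained(args.ppo_model_name)
# loader = DataLoader(eval_dataset, batch_size=4)
# model_map = {'ppo': args.ppo_model_name, 'sft': 'path/to/sft-model'}
# evaluate_with_chatgpt(
#     args=args,
#     model_map=model_map,
#     tokenizer=tokenizer,
#     loader=loader,
#     use_pipeline=True,
#     device='cuda' if torch.cuda.is_available() else 'cpu',
#     max_new_tokens=256,  # forwarded via **gen_kwargs to the generation helper
# )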