@charlesfrye
Last active September 20, 2024 06:30
Reproducing results from "Beat GPT-4o at Python by Searching with 100 Dumb LLaMAs"

See rune2e.sh for info on how to run the experiment.

client.py

from datetime import datetime
import json
from pathlib import Path
from dataclasses import dataclass, asdict

import modal

image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "openai==1.38.0", "datasets==2.20.0"
)

app = modal.App("eval-infinite-monkeys", image=image)

volume = modal.Volume.from_name("humaneval", create_if_missing=True)
DATA_DIR = Path("/mnt/humaneval")

default_system_prompt = "Write the body for the Python function provided in the prompt below. Do not write anything else. Your output will be directly concatenated with the prompt and the resulting function executed against tests."

MINUTES = 60  # seconds
HOURS = 60 * MINUTES


@dataclass
class CompletionParams:
    model: str = None
    max_tokens: int = 1024
    temperature: float = 0.7
    top_p: float = 0.9
    frequency_penalty: float = 0
    presence_penalty: float = 0
    n: int = 1
    stop: str = None
    seed: int = None


@dataclass
class ClientParams:
    app_name: str = "example-infinite-monkeys"
    workspace: str = None
    api_key: str = "super-secret-token"

    @property
    def url(self):
        # OpenAI-compatible base URL of the deployed Modal app
        return f"https://{self.workspace}--{self.app_name}-serve.modal.run/v1"


@app.local_entrypoint()
def main(
    app_name: str = "example-infinite-monkeys",
    workspace: str = None,
    api_key: str = "super-secret-token",
    model: str = None,
    max_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.9,
    frequency_penalty: float = 0,
    presence_penalty: float = 0,
    n: int = 1,
    stop: str = None,
    seed: int = None,
    data_dir: str = "dev-llm",
    subsample: int = 1,
    system_prompt: str = default_system_prompt,
    dry_run: bool = True,
):
    if workspace is None:
        workspace = modal.config._profile

    client_params = ClientParams(app_name, workspace, api_key)

    completion_params = CompletionParams(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        n=n,
        stop=stop,
        seed=seed,
    )

    save_dataset.remote(path=data_dir, subsample=subsample)

    results = run_human_eval.remote(
        client_params=client_params,
        completion_params=completion_params,
        system_prompt=system_prompt,
        data_dir=data_dir,
        dry_run=dry_run,
    )
    if results:
        with open("/tmp/results.jsonl", "w") as f:
            f.writelines(json.dumps(result) + "\n" for result in results)
        print(f"results saved locally to {f.name}")


@app.function(volumes={DATA_DIR: volume}, timeout=1 * HOURS)
def run_human_eval(
    client_params: ClientParams,
    completion_params: CompletionParams,
    data_dir="dev-llm",
    system_prompt: str = default_system_prompt,
    dry_run=True,
):
    dataset = load_dataset(data_dir)
    timestamp = datetime.utcnow().isoformat() + "Z"
    output_dir = Path(DATA_DIR) / data_dir / f"run-{timestamp}"
    output_dir.mkdir(parents=True, exist_ok=True)
    # fan out: one remote call per HumanEval problem
    handles = []
    for i, item in enumerate(dataset):
        handles.append(
            run_item.spawn(
                item,
                client_params,
                completion_params,
                system_prompt,
                output_dir,
                dry_run,
            )
        )
    for handle in handles:
        result = handle.get()
    if not dry_run:
        return result


@app.function(volumes={DATA_DIR: volume}, timeout=1 * HOURS)
def run_item(
    item: dict,
    client_params: ClientParams,
    completion_params: CompletionParams,
    system_prompt: str,
    output_dir: Path,
    dry_run: bool,
):
    client = create_client(client_params)
    if completion_params.model:
        print(
            Colors.BOLD,
            f"🧠: Using model {completion_params.model}. This may trigger a model load on first call!",
            Colors.END,
            sep="",
        )
    else:
        print(
            Colors.BOLD,
            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a model load!",
            Colors.END,
            sep="",
        )
        model = client.models.list().data[0]
        model = model.id
        print(
            Colors.BOLD,
            f"🧠: Using {model}",
            Colors.END,
            sep="",
        )
        completion_params.model = model

    prompt = item["prompt"]
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    per_request = 250  # request completions in chunks of at most this many
    ct, completions = completion_params.n, []
    if not dry_run:
        while ct > 0:
            response = get_completion(
                client,
                messages=messages,
                **asdict(completion_params) | dict(n=min(ct, per_request)),
            )
            if response:
                completions += [
                    {
                        "task_id": item["task_id"],
                        "completion": choice.message.content,
                    }
                    for choice in response.choices
                ]
            ct -= per_request

        index = item["task_id"].split("/")[-1]
        output_path = output_dir / f"{index}.jsonl"
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as f:
            f.writelines(json.dumps(completion) + "\n" for completion in completions)
        print(Colors.GREEN + f"Completions saved to {output_path}" + Colors.END)


class Colors:
    """ANSI color codes"""

    GREEN = "\033[0;32m"
    RED = "\033[0;31m"
    BLUE = "\033[0;34m"
    GRAY = "\033[0;90m"
    BOLD = "\033[1m"
    END = "\033[0m"


def get_completion(client, **kwargs):
    try:
        response = client.chat.completions.create(**kwargs)
        return response
    except Exception as e:
        print(Colors.RED, f"Error during API call: {e}", Colors.END, sep="")
        return None


def create_client(client_params: ClientParams):
    from openai import OpenAI

    client = OpenAI(api_key=client_params.api_key)
    client.base_url = client_params.url
    return client


@app.function(volumes={DATA_DIR: volume})
def save_dataset(path="dev-llm", subsample: int = 1):
    import datasets

    path = DATA_DIR / path

    # subsample is a percentage of the HumanEval test split
    ds = datasets.load_dataset(
        "openai/openai_humaneval",
        split=datasets.ReadInstruction("test", to=subsample, unit="%"),
    )

    ds.to_json(path / "data.jsonl")

    volume.commit()


def load_dataset(path="dev-llm"):
    import datasets

    path = DATA_DIR / path

    ds = datasets.load_dataset(path=str(path), data_files="data.jsonl")

    return ds["train"]
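For context on the system prompt above: human-eval scores a sample by concatenating the completion directly onto the problem prompt and running the problem's tests against the result, which is why the model is told to emit only the function body. A minimal sketch of that assembly (field names follow the HumanEval dataset; human-eval itself builds and executes this program in a sandboxed subprocess, not in-process):

def build_check_program(problem: dict, sample: dict) -> str:
    # the completion is appended verbatim after the signature + docstring in `prompt`
    return (
        problem["prompt"]
        + sample["completion"]
        + "\n"
        + problem["test"]
        + "\n"
        + f"check({problem['entry_point']})\n"
    )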
download_llama.py

import modal

MODELS_DIR = "/llamas"

DEFAULT_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEFAULT_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

volume = modal.Volume.from_name("llamas", create_if_missing=True)

image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        [
            "huggingface_hub",  # download models from the Hugging Face Hub
            "hf-transfer",  # download models faster with Rust
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)

MINUTES = 60
HOURS = 60 * MINUTES

app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])


@app.function(volumes={MODELS_DIR: volume}, timeout=4 * HOURS)
def download_model(model_name, model_revision, force_download=False):
    from huggingface_hub import snapshot_download

    volume.reload()

    snapshot_download(
        model_name,
        local_dir=MODELS_DIR,
        ignore_patterns=[
            "*.pt",
            "*.bin",
            "*.pth",
            "original/*",
        ],  # ensure safetensors
        revision=model_revision,
        force_download=force_download,
    )

    volume.commit()


@app.local_entrypoint()
def main(
    model_name: str = DEFAULT_NAME,
    model_revision: str = DEFAULT_REVISION,
    force_download: bool = False,
):
    download_model.remote(model_name, model_revision, force_download)
eval.py

from pathlib import Path

import modal

app = modal.App("humaneval-sandbox")

volume = modal.Volume.from_name("humaneval", create_if_missing=True)

sandbox_image = (
    modal.Image.debian_slim()
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/modal-labs/human-eval.git",
        "pip install -e human-eval",
    )
)

MINUTES = 60


@app.function(volumes={"/humaneval": volume})
def run_humaneval(sample_file_path: str, problem_file_path: str):
    with modal.Volume.ephemeral() as vol:
        with vol.batch_upload() as batch:
            batch.put_file(sample_file_path, "samples.jsonl")
            batch.put_file(problem_file_path, "problems.jsonl")
        print(f"Starting sandbox for {sample_file_path}")
        sandbox = modal.Sandbox.create(
            "bash",
            "-c",
            "evaluate_functional_correctness vol/samples.jsonl --problem_file=vol/problems.jsonl --n_workers=32",
            image=sandbox_image,
            volumes={"/vol": vol},
            timeout=5 * MINUTES,
            cpu=32,
        )
        try:
            sandbox.wait_for(4 * MINUTES)
            print(f"Finished sandbox for {sample_file_path}")
        except TimeoutError:
            print("Sandbox timed out")

        if sandbox.returncode == 0:
            print(sandbox.stdout.read())
            data = b""
            for chunk in vol.read_file("samples.jsonl_results.jsonl"):
                data += chunk
            with open(f"{sample_file_path}_results.jsonl", "wb") as f:
                f.write(data)
        else:
            print(f"Tests failed with code {sandbox.returncode}")
            print(sandbox.stderr.read())


@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def find_missing_files():
    import os

    volume.reload()

    # Find all files matching /humaneval/{env}/{run}/{id}.jsonl
    envs = [element for element in Path("/humaneval").iterdir() if element.is_dir()]
    for env in envs:
        print(f"looking in {env}")
        problem_file = env / "data.jsonl"

        pattern = "*/*.jsonl"
        handles = []
        for file_path in env.glob(pattern):
            # Skip files that end with _results.jsonl
            if str(file_path).endswith("_results.jsonl"):
                continue

            print(f"Checking {file_path}")
            # Check if the corresponding results file exists
            results_file = f"{file_path}_results.jsonl"
            if not os.path.exists(results_file):
                # If it doesn't exist, run run_humaneval
                handles.append(run_humaneval.spawn(file_path, problem_file))

        for handle in handles:
            handle.get()


@app.local_entrypoint()
def main():
    find_missing_files.remote()
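Each successful sandbox run writes a <id>.jsonl_results.jsonl file back next to the generations. A rough sketch of summarizing one of those files locally (this assumes, as in human-eval's output format, that every line is a JSON record carrying a task_id and a boolean passed field; the real analysis lives in the notebook):

import json
from collections import defaultdict


def pass_rates(results_path: str) -> dict[str, float]:
    # tally (passed, total) counts per HumanEval task
    counts = defaultdict(lambda: [0, 0])
    with open(results_path) as f:
        for line in f:
            record = json.loads(line)
            counts[record["task_id"]][0] += bool(record["passed"])
            counts[record["task_id"]][1] += 1
    return {task: passed / total for task, (passed, total) in counts.items()}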
inference.py

import modal

vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
    "vllm==0.5.3post1"
)

MODELS_DIR = "/llamas"
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
MODEL_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

try:
    volume = modal.Volume.lookup("llamas", create_if_missing=False)
except modal.exception.NotFoundError:
    raise Exception("Download models first with modal run download_llama.py")

app = modal.App("example-infinite-monkeys")

N_GPU = 1  # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
TOKEN = (
    "super-secret-token"  # auth token. for production use, replace with a modal.Secret
)

MINUTES = 60  # seconds
HOURS = 60 * MINUTES


@app.function(
    image=vllm_image,
    gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
    container_idle_timeout=5 * MINUTES,
    timeout=24 * HOURS,
    allow_concurrent_inputs=2,
    volumes={MODELS_DIR: volume},
    concurrency_limit=10,
)
@modal.asgi_app()
def serve():
    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.logger import RequestLogger
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.usage.usage_lib import UsageContext

    volume.reload()  # ensure we have the latest version of the weights

    # create a fastAPI app that uses vLLM's OpenAI-compatible router
    web_app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: CORS middleware for external requests
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token",
        description="See code for authentication details.",
    )
    web_app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    # wrap vllm's router in auth router
    router.include_router(api_server.router)
    # add authed vllm to our fastAPI app
    web_app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODELS_DIR + "/" + MODEL_NAME,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=2048,
        enforce_eager=False,  # capture the graph for faster inference, but slower cold starts (30s > 20s)
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    model_config = get_model_config(engine)

    request_logger = RequestLogger(max_log_len=2048)

    api_server.openai_serving_chat = OpenAIServingChat(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return web_app


def get_model_config(engine):
    import asyncio

    try:  # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # if the current process was launched by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # when using single vLLM without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    return model_config
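Once deployed, the app serves the OpenAI chat-completions protocol behind bearer-token auth, so any OpenAI-compatible client pointed at the Modal URL can reach it. A quick smoke test, assuming the default app name and token above (substitute your own Modal workspace; the URL pattern matches ClientParams.url in client.py):

from openai import OpenAI

client = OpenAI(
    api_key="super-secret-token",  # must match TOKEN above
    base_url="https://YOUR-WORKSPACE--example-infinite-monkeys-serve.modal.run/v1",
)

# the first request may trigger a cold start and model load
model = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "Write a haiku about dumb LLaMAs."}],
    max_tokens=64,
)
print(response.choices[0].message.content)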
(A Jupyter notebook with the analysis is included in the gist but could not be rendered on this page.)
rune2e.sh

#!/bin/bash
set -euo pipefail
IFS=$'\n\t'
command -v modal >/dev/null 2>&1 || { echo >&2 "modal command not found. Install modal first! Aborting."; exit 1; }
echo 'downloading LLaMA 3.1 8B'
echo 'make sure to create a Secret called huggingface on Modal and accept the LLaMA 3.1 license'
modal run download_llama.py
echo 'deploying vLLM inference server'
modal deploy inference.py
echo 'running HumanEval generation'
modal run client.py --data-dir test --no-dry-run --n 1000 --subsample 100
echo 'running HumanEval evaluation'
modal run eval.py::find_missing_files
echo 'run "modal launch jupyter --volume humaneval" and upload the notebook to run the analysis'
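The analysis in the notebook boils down to HumanEval's pass@k metric. For reference, the standard unbiased estimator from the HumanEval paper, given n samples for a problem of which c pass, is pass@k = 1 - C(n-c, k) / C(n, k). A minimal NumPy sketch of that estimator (not the notebook's exact code):

import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    # unbiased estimate of pass@k from n samples with c correct
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))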