@charlesfrye
Last active September 20, 2024 06:30
Reproducing results from "Beat GPT-4o at Python by Searching with 100 Dumb LLaMAs"

See rune2e.sh for info on how to run the experiment.

client.py

from datetime import datetime
import json
from pathlib import Path
from dataclasses import dataclass, asdict

import modal

image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "openai==1.38.0", "datasets==2.20.0"
)

app = modal.App("eval-infinite-monkeys", image=image)

volume = modal.Volume.from_name("humaneval", create_if_missing=True)
DATA_DIR = Path("/mnt/humaneval")

default_system_prompt = "Write the body for the Python function provided in the prompt below. Do not write anything else. Your output will be directly concatenated with the prompt and the resulting function executed against tests."

MINUTES = 60  # seconds
HOURS = 60 * MINUTES


@dataclass
class CompletionParams:
    model: str = None
    max_tokens: int = 1024
    temperature: float = 0.7
    top_p: float = 0.9
    frequency_penalty: float = 0
    presence_penalty: float = 0
    n: int = 1
    stop: str = None
    seed: int = None


@dataclass
class ClientParams:
    app_name: str = "example-infinite-monkeys"
    workspace: str = None
    api_key: str = "super-secret-token"

    @property
    def url(self):
        # OpenAI-compatible base URL of the deployed Modal app
        return f"https://{self.workspace}--{self.app_name}-serve.modal.run/v1"


@app.local_entrypoint()
def main(
    app_name: str = "example-infinite-monkeys",
    workspace: str = None,
    api_key: str = "super-secret-token",
    model: str = None,
    max_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.9,
    frequency_penalty: float = 0,
    presence_penalty: float = 0,
    n: int = 1,
    stop: str = None,
    seed: int = None,
    data_dir: str = "dev-llm",
    subsample: int = 1,
    system_prompt: str = default_system_prompt,
    dry_run: bool = True,
):
    if workspace is None:
        workspace = modal.config._profile

    client_params = ClientParams(app_name, workspace, api_key)

    completion_params = CompletionParams(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        n=n,
        stop=stop,
        seed=seed,
    )

    save_dataset.remote(path=data_dir, subsample=subsample)

    results = run_human_eval.remote(
        client_params=client_params,
        completion_params=completion_params,
        system_prompt=system_prompt,
        data_dir=data_dir,
        dry_run=dry_run,
    )
    if results:
        with open("/tmp/results.jsonl", "w") as f:
            f.writelines(json.dumps(result) + "\n" for result in results)
        print(f"results saved locally to {f.name}")


@app.function(volumes={DATA_DIR: volume}, timeout=1 * HOURS)
def run_human_eval(
    client_params: ClientParams,
    completion_params: CompletionParams,
    data_dir="dev-llm",
    system_prompt: str = default_system_prompt,
    dry_run=True,
):
    dataset = load_dataset(data_dir)
    timestamp = datetime.utcnow().isoformat() + "Z"
    output_dir = Path(DATA_DIR) / data_dir / f"run-{timestamp}"
    output_dir.mkdir(parents=True, exist_ok=True)
    # fan out: one remote call per HumanEval problem
    handles = []
    for i, item in enumerate(dataset):
        handles.append(
            run_item.spawn(
                item,
                client_params,
                completion_params,
                system_prompt,
                output_dir,
                dry_run,
            )
        )
    for handle in handles:
        result = handle.get()
    if not dry_run:
        return result


@app.function(volumes={DATA_DIR: volume}, timeout=1 * HOURS)
def run_item(
    item: dict,
    client_params: ClientParams,
    completion_params: CompletionParams,
    system_prompt: str,
    output_dir: Path,
    dry_run: bool,
):
    client = create_client(client_params)
    if completion_params.model:
        print(
            Colors.BOLD,
            f"🧠: Using model {completion_params.model}. This may trigger a model load on first call!",
            Colors.END,
            sep="",
        )
    else:
        print(
            Colors.BOLD,
            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a model load!",
            Colors.END,
            sep="",
        )
        model = client.models.list().data[0]
        model = model.id
        print(
            Colors.BOLD,
            f"🧠: Using {model}",
            Colors.END,
            sep="",
        )
        completion_params.model = model

    prompt = item["prompt"]
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    per_request = 250  # request completions in chunks of at most this many
    ct, completions = completion_params.n, []
    if not dry_run:
        while ct > 0:
            response = get_completion(
                client,
                messages=messages,
                **asdict(completion_params) | dict(n=min(ct, per_request)),
            )
            if response:
                completions += [
                    {
                        "task_id": item["task_id"],
                        "completion": choice.message.content,
                    }
                    for choice in response.choices
                ]
            ct -= per_request

        index = item["task_id"].split("/")[-1]
        output_path = output_dir / f"{index}.jsonl"
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as f:
            f.writelines(json.dumps(completion) + "\n" for completion in completions)
        print(Colors.GREEN + f"Completions saved to {output_path}" + Colors.END)


class Colors:
    """ANSI color codes"""

    GREEN = "\033[0;32m"
    RED = "\033[0;31m"
    BLUE = "\033[0;34m"
    GRAY = "\033[0;90m"
    BOLD = "\033[1m"
    END = "\033[0m"


def get_completion(client, **kwargs):
    try:
        response = client.chat.completions.create(**kwargs)
        return response
    except Exception as e:
        print(Colors.RED, f"Error during API call: {e}", Colors.END, sep="")
        return None


def create_client(client_params: ClientParams):
    from openai import OpenAI

    client = OpenAI(api_key=client_params.api_key)
    client.base_url = client_params.url
    return client


@app.function(volumes={DATA_DIR: volume})
def save_dataset(path="dev-llm", subsample: int = 1):
    import datasets

    path = DATA_DIR / path

    # subsample is a percentage of the HumanEval test split
    ds = datasets.load_dataset(
        "openai/openai_humaneval",
        split=datasets.ReadInstruction("test", to=subsample, unit="%"),
    )

    ds.to_json(path / "data.jsonl")

    volume.commit()


def load_dataset(path="dev-llm"):
    import datasets

    path = DATA_DIR / path

    ds = datasets.load_dataset(path=str(path), data_files="data.jsonl")

    return ds["train"]
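For context on the system prompt above: human-eval scores a sample by concatenating the completion directly onto the problem prompt and running the problem's tests against the result, which is why the model is told to emit only the function body. A minimal sketch of that assembly (field names follow the HumanEval dataset; human-eval itself builds and executes this program in a sandboxed subprocess, not in-process):

def build_check_program(problem: dict, sample: dict) -> str:
    # the completion is appended verbatim after the signature + docstring in `prompt`
    return (
        problem["prompt"]
        + sample["completion"]
        + "\n"
        + problem["test"]
        + "\n"
        + f"check({problem['entry_point']})\n"
    )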
download_llama.py

import modal

MODELS_DIR = "/llamas"

DEFAULT_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DEFAULT_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

volume = modal.Volume.from_name("llamas", create_if_missing=True)

image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        [
            "huggingface_hub",  # download models from the Hugging Face Hub
            "hf-transfer",  # download models faster with Rust
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)

MINUTES = 60
HOURS = 60 * MINUTES

app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])


@app.function(volumes={MODELS_DIR: volume}, timeout=4 * HOURS)
def download_model(model_name, model_revision, force_download=False):
    from huggingface_hub import snapshot_download

    volume.reload()

    snapshot_download(
        model_name,
        local_dir=MODELS_DIR,
        ignore_patterns=[
            "*.pt",
            "*.bin",
            "*.pth",
            "original/*",
        ],  # ensure safetensors
        revision=model_revision,
        force_download=force_download,
    )

    volume.commit()


@app.local_entrypoint()
def main(
    model_name: str = DEFAULT_NAME,
    model_revision: str = DEFAULT_REVISION,
    force_download: bool = False,
):
    download_model.remote(model_name, model_revision, force_download)
eval.py

from pathlib import Path

import modal

app = modal.App("humaneval-sandbox")

volume = modal.Volume.from_name("humaneval", create_if_missing=True)

sandbox_image = (
    modal.Image.debian_slim()
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/modal-labs/human-eval.git",
        "pip install -e human-eval",
    )
)

MINUTES = 60


@app.function(volumes={"/humaneval": volume})
def run_humaneval(sample_file_path: str, problem_file_path: str):
    with modal.Volume.ephemeral() as vol:
        with vol.batch_upload() as batch:
            batch.put_file(sample_file_path, "samples.jsonl")
            batch.put_file(problem_file_path, "problems.jsonl")
        print(f"Starting sandbox for {sample_file_path}")
        sandbox = modal.Sandbox.create(
            "bash",
            "-c",
            "evaluate_functional_correctness vol/samples.jsonl --problem_file=vol/problems.jsonl --n_workers=32",
            image=sandbox_image,
            volumes={"/vol": vol},
            timeout=5 * MINUTES,
            cpu=32,
        )
        try:
            sandbox.wait_for(4 * MINUTES)
            print(f"Finished sandbox for {sample_file_path}")
        except TimeoutError:
            print("Sandbox timed out")

        if sandbox.returncode == 0:
            print(sandbox.stdout.read())
            data = b""
            for chunk in vol.read_file("samples.jsonl_results.jsonl"):
                data += chunk
            with open(f"{sample_file_path}_results.jsonl", "wb") as f:
                f.write(data)
        else:
            print(f"Tests failed with code {sandbox.returncode}")
            print(sandbox.stderr.read())


@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def find_missing_files():
    import os

    volume.reload()

    # Find all files matching /humaneval/{env}/{run}/{id}.jsonl
    envs = [element for element in Path("/humaneval").iterdir() if element.is_dir()]
    for env in envs:
        print(f"looking in {env}")
        problem_file = env / "data.jsonl"

        pattern = "*/*.jsonl"
        handles = []
        for file_path in env.glob(pattern):
            # Skip files that end with _results.jsonl
            if str(file_path).endswith("_results.jsonl"):
                continue

            print(f"Checking {file_path}")
            # Check if the corresponding results file exists
            results_file = f"{file_path}_results.jsonl"
            if not os.path.exists(results_file):
                # If it doesn't exist, run run_humaneval
                handles.append(run_humaneval.spawn(file_path, problem_file))

        for handle in handles:
            handle.get()


@app.local_entrypoint()
def main():
    find_missing_files.remote()
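Each successful sandbox run writes a <id>.jsonl_results.jsonl file back next to the generations. A rough sketch of summarizing one of those files locally (this assumes, as in human-eval's output format, that every line is a JSON record carrying a task_id and a boolean passed field; the real analysis lives in the notebook):

import json
from collections import defaultdict


def pass_rates(results_path: str) -> dict[str, float]:
    # tally (passed, total) counts per HumanEval task
    counts = defaultdict(lambda: [0, 0])
    with open(results_path) as f:
        for line in f:
            record = json.loads(line)
            counts[record["task_id"]][0] += bool(record["passed"])
            counts[record["task_id"]][1] += 1
    return {task: passed / total for task, (passed, total) in counts.items()}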
inference.py

import modal

vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
    "vllm==0.5.3post1"
)

MODELS_DIR = "/llamas"
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
MODEL_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"

try:
    volume = modal.Volume.lookup("llamas", create_if_missing=False)
except modal.exception.NotFoundError:
    raise Exception("Download models first with modal run download_llama.py")

app = modal.App("example-infinite-monkeys")

N_GPU = 1  # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
TOKEN = (
    "super-secret-token"  # auth token. for production use, replace with a modal.Secret
)

MINUTES = 60  # seconds
HOURS = 60 * MINUTES


@app.function(
    image=vllm_image,
    gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
    container_idle_timeout=5 * MINUTES,
    timeout=24 * HOURS,
    allow_concurrent_inputs=2,
    volumes={MODELS_DIR: volume},
    concurrency_limit=10,
)
@modal.asgi_app()
def serve():
    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.logger import RequestLogger
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.usage.usage_lib import UsageContext

    volume.reload()  # ensure we have the latest version of the weights

    # create a fastAPI app that uses vLLM's OpenAI-compatible router
    web_app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: CORS middleware for external requests
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token",
        description="See code for authentication details.",
    )
    web_app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    # wrap vllm's router in auth router
    router.include_router(api_server.router)
    # add authed vllm to our fastAPI app
    web_app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODELS_DIR + "/" + MODEL_NAME,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=2048,
        enforce_eager=False,  # capture the graph for faster inference, but slower cold starts (30s > 20s)
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    model_config = get_model_config(engine)

    request_logger = RequestLogger(max_log_len=2048)

    api_server.openai_serving_chat = OpenAIServingChat(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return web_app


def get_model_config(engine):
    import asyncio

    try:  # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # if the current process was launched by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # when using single vLLM without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    return model_config
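Once deployed, the app serves the OpenAI chat-completions protocol behind bearer-token auth, so any OpenAI-compatible client pointed at the Modal URL can reach it. A quick smoke test, assuming the default app name and token above (substitute your own Modal workspace; the URL pattern matches ClientParams.url in client.py):

from openai import OpenAI

client = OpenAI(
    api_key="super-secret-token",  # must match TOKEN above
    base_url="https://YOUR-WORKSPACE--example-infinite-monkeys-serve.modal.run/v1",
)

# the first request may trigger a cold start and model load
model = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "Write a haiku about dumb LLaMAs."}],
    max_tokens=64,
)
print(response.choices[0].message.content)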
(A Jupyter notebook with the analysis is included in the gist but could not be rendered on this page.)
rune2e.sh

#!/bin/bash
set -euo pipefail
IFS=$'\n\t'
command -v modal >/dev/null 2>&1 || { echo >&2 "modal command not found. Install modal first! Aborting."; exit 1; }
echo 'downloading LLaMA 3.1 8B'
echo 'make sure to create a Secret called huggingface on Modal and accept the LLaMA 3.1 license'
modal run download_llama.py
echo 'deploying vLLM inference server'
modal deploy inference.py
echo 'running HumanEval generation'
modal run client.py --data-dir test --no-dry-run --n 1000 --subsample 100
echo 'running HumanEval evaluation'
modal run eval.py::find_missing_files
echo 'run "modal launch jupyter --volume humaneval" and upload the notebook to run the analysis'
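The analysis in the notebook boils down to HumanEval's pass@k metric. For reference, the standard unbiased estimator from the HumanEval paper, given n samples for a problem of which c pass, is pass@k = 1 - C(n-c, k) / C(n, k). A minimal NumPy sketch of that estimator (not the notebook's exact code):

import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    # unbiased estimate of pass@k from n samples with c correct
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))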