Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save mrmaheshrajput/64265d19ff8ac7aaef7dc583f90c762f to your computer and use it in GitHub Desktop.
"""Deploy microsoft/Phi-3-mini-4k-instruct to a SageMaker real-time
endpoint using the DJL Large Model Inference (LMI) container with the
vLLM rolling-batch backend."""
import io
import sagemaker
import boto3
import json

# Your IAM role that provides access to SageMaker and S3.
# See https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-ex-role.html
# if running on a SageMaker notebook, or use
# sagemaker.get_execution_role() if running on SageMaker Studio.
iam_role = "arn:aws:iam::1111111111:role/service-role/AmazonSageMaker-ExecutionRole-00000000T000000"

# Manages interactions with the SageMaker APIs.
sagemaker_session = sagemaker.session.Session()
# FIX: use the public `boto_region_name` accessor instead of the private
# `_region_name` attribute, which is an implementation detail and may
# change between SDK versions.
region = sagemaker_session.boto_region_name

# boto3 SageMaker runtime client, used to invoke the endpoint
# (including streaming responses).
smr_client = boto3.client("sagemaker-runtime")

# Get the LMI container image URI for this region.
# Available frameworks: "djl-lmi" (for vllm, lmi-dist),
# "djl-tensorrtllm" (for tensorrt-llm),
# "djl-neuronx" (for transformers neuronx).
container_uri = sagemaker.image_uris.retrieve(
    framework="djl-lmi", version="0.28.0", region=region
)

# Instance type the model will be deployed to. Choose a bigger
# instance if your model is larger than ~7B parameters.
instance_type = "ml.g5.4xlarge"

# Create a unique endpoint name (a timestamp suffix is appended).
endpoint_name = sagemaker.utils.name_from_base("phi3-4k-lmi-endpoint")

# Create the SageMaker Model. phi-3-mini-4k fits well on this
# instance's GPU as it only has 3.8B parameters.
model = sagemaker.Model(
    image_uri=container_uri,
    role=iam_role,
    # Specify all environment-variable configuration in this map;
    # these are consumed by the LMI container at startup.
    env={
        "HF_MODEL_ID": "microsoft/Phi-3-mini-4k-instruct",
        "OPTION_ROLLING_BATCH": "vllm",
        "TENSOR_PARALLEL_DEGREE": "max",
        "OPTION_MAX_ROLLING_BATCH_SIZE": "2",
        "OPTION_DTYPE": "fp16",
        # Streaming will work without this variable.
        # "OPTION_ENABLE_STREAMING": "true",
    },
)

# Deploy the model behind a real-time endpoint
# (blocks until the endpoint is in service).
model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    endpoint_name=endpoint_name,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment