Last active August 6, 2024 17:53
APIM Golden Template for Azure OpenAI Load Balance and Usage Tracking

Load balancing multiple AOAI resources and tracking usage for one or more use cases

Decision points

  • Which PTU deployments (models, regions)
  • Which PAYGO deployments (models, regions)
  • Token tracking level (use case, teams, etc.)

Load Balancing Strategy

PAYGO only

  • Setup of priority groups: PAYGO (highest priority in region A), then PAYGO (lower priorities in regions B, C, D, ...)


  • Priority 1: PAYGO in region A, PAYGO in region A
  • Priority 2: PAYGO in region B, PAYGO in region B


  • Setup of priority groups: PTU (highest priority), then PAYGO (lower priorities, potentially multiple regions)


  • Priority 1: PTU in region A
  • Priority 2: PAYGO in region A
  • Priority 3: PAYGO in region B

Separating low- and high-priority use cases

Do use cases need to be differentiated by their latency sensitivity?

All use cases have equal priority

Typically a good choice when all use cases need to be fast/are equally important.

**Solution:**

  • One load balancing strategy (PTU+PAYGO)
  • Monitor PTU utilization and the number of 429 errors, and scale the number of PTUs up or down accordingly

Some use cases have higher priority

Typically useful when there are times when PTUs are underutilized and low-priority use cases could reuse the PTUs to save cost.

Solution 1:

  • Two load balancing strategies (PTU+PAYGO)
  • Low-priority requests go to a different endpoint route in APIM and only hit PTU during specific times of the day/days of the week

Solution 2:

  • Two load balancing strategies (PTU+PAYGO), requires two PTU deployments (e.g. 2x 50 PTUs for gpt-4o)
  • High-priority requests - Load balancing strategy --> implement via different API route
    • Priority 1: PTU-1 in region A, PTU-2 in region A
    • Priority 2: PAYGO in region A
    • Priority 3: PAYGO in region B
  • Low-priority requests - Load balancing strategy --> implement via different API route
    • Priority 1: PTU-2 in region A
    • Priority 2: PAYGO in region A
    • Priority 3: PAYGO in region B
  • This ensures that 50% of the capacity is reserved for the high-priority use cases only
<base />
<!-- Load the shared backend list (including per-backend throttling state) from the APIM cache -->
<cache-lookup-value key="listBackends" variable-name="listBackends" />
<!-- On a cache miss the variable is not set, so build the initial backend list -->
<choose>
    <when condition="@(context.Variables.ContainsKey("listBackends") == false)">
        <set-variable name="listBackends" value="@{
            // -------------------------------------------------
            // ------- Explanation of backend properties -------
            // -------------------------------------------------
            // "url": Your backend url. The policy appends "/openai" to this value before forwarding.
            // "priority": Lower value means higher priority over other backends.
            //             If you have one or more Priority 1 backends, they will always be used instead
            //             of Priority 2 or higher. Backends with higher values are only used while all
            //             backends with lower values (top priority) are throttling.
            // "isThrottling": Indicates if this endpoint is currently returning 429 (Too many requests)
            // "retryAfter": When to mark this endpoint as healthy again after it returned a 429 response
            // "hasTimeRestriction": Optional. When true, the backend selection logic skips this backend
            //                       outside of the allowed hours. Treated as false when omitted.
            JArray backends = new JArray();

            // Priority 1 backend, only reachable during the allowed hours
            backends.Add(new JObject()
            {
                { "url", "" },
                { "priority", 1},
                { "isThrottling", false },
                { "retryAfter", DateTime.MinValue },
                { "hasTimeRestriction", true }
            });

            // Priority 2 backends, always reachable
            backends.Add(new JObject()
            {
                { "url", "" },
                { "priority", 2},
                { "isThrottling", false },
                { "retryAfter", DateTime.MinValue }
            });

            backends.Add(new JObject()
            {
                { "url", "" },
                { "priority", 2},
                { "isThrottling", false },
                { "retryAfter", DateTime.MinValue }
            });

            return backends;
        }" />
        <!-- And store the variable into cache again -->
        <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
    </when>
</choose>
<!-- Acquire a token with the APIM managed identity and send it to the backend as a bearer token.
     The "resource" attribute is an empty placeholder here and must be set to the token audience of the backend. -->
<authentication-managed-identity resource="" output-token-variable-name="msi-access-token" ignore-error="false" />
<set-header name="Authorization" exists-action="override">
    <value>@("Bearer " + (string)context.Variables["msi-access-token"])</value>
</set-header>
<!-- Initial values for the retry loop: no backend picked yet, and assume one backend is available
     so that the first evaluation of the retry condition can proceed -->
<set-variable name="backendIndex" value="-1" />
<set-variable name="remainingBackends" value="1" />
<!-- Emit token usage per APIM Subscription, requires Application Insights logging to be enabled for the API.
     Add further <dimension> elements to track usage at another level (e.g. per use case or team). -->
<azure-openai-emit-token-metric>
    <dimension name="SubscriptionId" value="@(context.Subscription.Id)" />
</azure-openai-emit-token-metric>
<!-- Forward the request and, on 429 or 5xx, immediately retry on another backend for as long as
     at least one usable backend remains (at most 50 attempts, no delay between attempts) -->
<retry condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) && ((Int32)context.Variables["remainingBackends"]) > 0)" count="50" interval="0">
    <!-- Before picking the backend, let's verify if there is any that should be set to not throttling anymore -->
    <set-variable name="listBackends" value="@{
        JArray backends = (JArray)context.Variables["listBackends"];

        for (int i = 0; i < backends.Count; i++)
        {
            JObject backend = (JObject)backends[i];

            // The retry-after time has passed: consider the backend healthy again
            if (backend.Value<bool>("isThrottling") && DateTime.Now >= backend.Value<DateTime>("retryAfter"))
            {
                backend["isThrottling"] = false;
                backend["retryAfter"] = DateTime.MinValue;
            }
        }

        return backends;
    }" />
    <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
    <!-- Decide whether backends flagged with "hasTimeRestriction" may be used right now.
         They are blocked between 08:00 and 22:00 (W. Europe Standard Time) and allowed outside of that window. -->
    <set-variable name="timeRestrictedBackendsAllowed" value="@{
        TimeZoneInfo timeZoneInfo = TimeZoneInfo.FindSystemTimeZoneById("W. Europe Standard Time");
        DateTime currentTime = TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, timeZoneInfo);
        TimeSpan blockedFrom = new TimeSpan(8, 0, 0);
        TimeSpan blockedUntil = new TimeSpan(22, 0, 0);
        TimeSpan timeOfDay = currentTime.TimeOfDay;

        return !(timeOfDay >= blockedFrom && timeOfDay <= blockedUntil);
    }" />
    <!-- This is the main logic to pick the backend to be used -->
    <set-variable name="backendIndex" value="@{
        JArray backends = (JArray)context.Variables["listBackends"];
        bool restrictedAllowed = context.Variables.GetValueOrDefault<bool>("timeRestrictedBackendsAllowed", false);

        int selectedPriority = Int32.MaxValue;
        int fallbackIndex = -1;
        List<int> availableBackends = new List<int>();

        for (int i = 0; i < backends.Count; i++)
        {
            JObject backend = (JObject)backends[i];
            bool hasTimeRestriction = backend.Value<bool?>("hasTimeRestriction") ?? false;

            // Skip this backend if it's restricted by time and we're not within the allowed hours
            if (hasTimeRestriction && !restrictedAllowed)
            {
                continue;
            }

            // Remember the first backend we are allowed to use, as a last resort
            if (fallbackIndex < 0)
            {
                fallbackIndex = i;
            }

            if (backend.Value<bool>("isThrottling"))
            {
                continue;
            }

            int backendPriority = backend.Value<int>("priority");

            if (backendPriority < selectedPriority)
            {
                // Found a better priority group: start collecting candidates from scratch
                selectedPriority = backendPriority;
                availableBackends.Clear();
                availableBackends.Add(i);
            }
            else if (backendPriority == selectedPriority)
            {
                availableBackends.Add(i);
            }
        }

        if (availableBackends.Count == 1)
        {
            return availableBackends[0];
        }

        if (availableBackends.Count > 0)
        {
            //Returns a random backend from the list if we have more than one available with the same priority
            return availableBackends[new Random().Next(0, availableBackends.Count)];
        }

        //If there are no available backends, the request will be sent to the first one that is not
        //blocked by its time restriction (or to the very first one if all of them are blocked)
        return fallbackIndex >= 0 ? fallbackIndex : 0;
    }" />
    <set-variable name="backendUrl" value="@(((JObject)((JArray)context.Variables["listBackends"])[(Int32)context.Variables["backendIndex"]]).Value<string>("url") + "/openai")" />
    <set-backend-service base-url="@((string)context.Variables["backendUrl"])" />
    <forward-request buffer-request-body="true" />
    <choose>
        <!-- In case we got 429 or 5xx from a backend, update the list with its status -->
        <when condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) )">
            <!-- Re-read the list from the cache before changing it -->
            <cache-lookup-value key="listBackends" variable-name="listBackends" />
            <set-variable name="listBackends" value="@{
                // Reads a header as a non-negative number of seconds; returns -1 when the header
                // is missing or is not a plain integer, instead of failing the whole policy
                int ReadSeconds(string headerName)
                {
                    int seconds;
                    string raw = context.Response.Headers.GetValueOrDefault(headerName, "");
                    return (int.TryParse(raw, out seconds) && seconds >= 0) ? seconds : -1;
                }

                JArray backends = (JArray)context.Variables["listBackends"];
                int currentBackendIndex = context.Variables.GetValueOrDefault<int>("backendIndex");

                int retryAfter = ReadSeconds("Retry-After");

                if (retryAfter < 0)
                {
                    retryAfter = ReadSeconds("x-ratelimit-reset-requests");
                }

                if (retryAfter < 0)
                {
                    retryAfter = ReadSeconds("x-ratelimit-reset-tokens");
                }

                if (retryAfter < 0)
                {
                    // None of the headers gave a usable value: back off for 10 seconds
                    retryAfter = 10;
                }

                JObject backend = (JObject)backends[currentBackendIndex];
                backend["isThrottling"] = true;
                backend["retryAfter"] = DateTime.Now.AddSeconds(retryAfter);

                return backends;
            }" />
            <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
            <!-- Count the backends that can still be tried; the retry loop stops when this reaches 0 -->
            <set-variable name="remainingBackends" value="@{
                JArray backends = (JArray)context.Variables["listBackends"];
                bool restrictedAllowed = context.Variables.GetValueOrDefault<bool>("timeRestrictedBackendsAllowed", false);

                int remainingBackends = 0;

                for (int i = 0; i < backends.Count; i++)
                {
                    JObject backend = (JObject)backends[i];
                    bool hasTimeRestriction = backend.Value<bool?>("hasTimeRestriction") ?? false;

                    // A backend blocked by its time restriction cannot be picked, so it does not count
                    if (!backend.Value<bool>("isThrottling") && (restrictedAllowed || !hasTimeRestriction))
                    {
                        remainingBackends++;
                    }
                }

                return remainingBackends;
            }" />
        </when>
    </choose>
</retry>
<base />
<!-- This will return the used backend URL in the HTTP header response ("none" if no backend was picked).
     Remove it if you don't want to expose this data -->
<set-header name="x-openai-backendurl" exists-action="override">
    <value>@(context.Variables.GetValueOrDefault<string>("backendUrl", "none"))</value>
</set-header>
<base />
