|
<policies> |
|
<inbound> |
|
<base />
<!-- Load the shared backend list from the cache into the listBackends context variable -->
<cache-lookup-value key="listBackends" variable-name="listBackends" />
<!-- Cache miss: build the default backend list and publish it to the cache -->
<choose>
    <when condition="@(!context.Variables.ContainsKey("listBackends"))">
        <set-variable name="listBackends" value="@{
            // Properties carried by every backend entry:
            //   url                : base URL of the backend.
            //   priority           : lower value means higher priority. Backends with a higher value are only
            //                        picked when every backend with a lower value is throttling.
            //   isThrottling       : true while the backend is considered unavailable; it is set after a
            //                        429 or 5xx response from that backend.
            //   retryAfter         : point in time after which a throttling backend is marked healthy again.
            //   hasTimeRestriction : optional; when true the backend is only eligible while the time check
            //                        of the backend selection step passes. Absent means no restriction.

            JObject ptu = new JObject();
            ptu["url"] = "https://azure-openai-ptu.openai.azure.com/";
            ptu["priority"] = 1;
            ptu["isThrottling"] = false;
            ptu["retryAfter"] = DateTime.MinValue;
            ptu["hasTimeRestriction"] = true;

            JObject payGoFirst = new JObject();
            payGoFirst["url"] = "https://azure-openai-paygo1.openai.azure.com/";
            payGoFirst["priority"] = 2;
            payGoFirst["isThrottling"] = false;
            payGoFirst["retryAfter"] = DateTime.MinValue;

            JObject payGoSecond = new JObject();
            payGoSecond["url"] = "https://azure-openai-paygo2.openai.azure.com/";
            payGoSecond["priority"] = 2;
            payGoSecond["isThrottling"] = false;
            payGoSecond["retryAfter"] = DateTime.MinValue;

            // Order matters: other steps address the backends by their index in this array
            return new JArray(ptu, payGoFirst, payGoSecond);
        }" />
        <!-- Publish the freshly built list so later requests find it in the cache -->
        <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
    </when>
</choose>
|
<!-- Acquire a managed identity token for the Cognitive Services resource and forward it as the bearer token -->
<authentication-managed-identity resource="https://cognitiveservices.azure.com"
                                 output-token-variable-name="msi-access-token"
                                 ignore-error="false" />
<set-header name="Authorization" exists-action="override">
    <value>@(String.Concat("Bearer ", (string)context.Variables["msi-access-token"]))</value>
</set-header>
|
<!-- Retry bookkeeping. Both values are created through expressions so they are stored as Int32;
     the retry condition and the backend URL lookup cast these variables to Int32, and a plain
     literal would be stored as a string instead.
     backendIndex: index of the backend chosen for the current attempt (-1 = none chosen yet).
     remainingBackends: number of backends not flagged as throttling (starts at 1 so the first attempt may be retried). -->
<set-variable name="backendIndex" value="@(-1)" />
<set-variable name="remainingBackends" value="@(1)" />
|
<!-- Token usage metric, split by APIM subscription id. Application Insights logging must be enabled for this API. -->
<azure-openai-emit-token-metric>
    <dimension name="SubscriptionId"
               value="@(context.Subscription.Id)" />
</azure-openai-emit-token-metric>
|
</inbound> |
|
<backend> |
|
<retry condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) && ((Int32)context.Variables["remainingBackends"]) > 0)" count="50" interval="0"> |
|
<!-- Housekeeping before each attempt: clear the throttling flag of every backend whose retryAfter time has passed -->
<set-variable name="listBackends" value="@{
    JArray pool = (JArray)context.Variables["listBackends"];

    foreach (JObject entry in pool)
    {
        // Nothing to reset when the backend is healthy or its cool-down has not elapsed yet
        bool nothingToReset = !entry.Value<bool>("isThrottling") || DateTime.Now < entry.Value<DateTime>("retryAfter");
        if (nothingToReset)
        {
            continue;
        }

        entry["isThrottling"] = false;
        entry["retryAfter"] = DateTime.MinValue;
    }

    return pool;
}" />
<!-- Write the refreshed list back to the cache -->
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
|
<!-- Backend selection: among the non-throttling backends that pass the time gate, take the ones with the
     lowest priority value and pick one of them (at random when several share that value) -->
<set-variable name="backendIndex" value="@{
    // Time gate for backends flagged with hasTimeRestriction, evaluated in the W. Europe Standard Time zone.
    // NOTE(review): the gate is open when the local time lies OUTSIDE 08:00-22:00, so a time-restricted
    // backend is skipped between 08:00 and 22:00. Confirm this is the intended window.
    TimeZoneInfo zone = TimeZoneInfo.FindSystemTimeZoneById("W. Europe Standard Time");
    TimeSpan localTimeOfDay = TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, zone).TimeOfDay;
    TimeSpan windowStart = new TimeSpan(8, 0, 0);
    TimeSpan windowEnd = new TimeSpan(22, 0, 0);
    bool isWithinAllowedHours = localTimeOfDay < windowStart || localTimeOfDay > windowEnd;

    JArray pool = (JArray)context.Variables["listBackends"];
    int bestPriority = Int32.MaxValue;
    List<int> candidates = new List<int>();

    for (int index = 0; index < pool.Count; index++)
    {
        JObject entry = (JObject)pool[index];

        // Throttling backends are never candidates
        if (entry.Value<bool>("isThrottling"))
        {
            continue;
        }

        int priority = entry.Value<int>("priority");
        bool timeRestricted = entry.Value<bool?>("hasTimeRestriction") ?? false;

        // Time-restricted backends are skipped while the gate is closed
        if (timeRestricted && !isWithinAllowedHours)
        {
            continue;
        }

        // A worse priority than the current best can be ignored
        if (priority > bestPriority)
        {
            continue;
        }

        // A strictly better priority discards everything collected so far
        if (priority < bestPriority)
        {
            bestPriority = priority;
            candidates.Clear();
        }

        candidates.Add(index);
    }

    switch (candidates.Count)
    {
        case 0:
            // No backend is available: the request is sent to the first one anyway
            return 0;
        case 1:
            return candidates[0];
        default:
            // Several backends share the best priority: pick one of them at random
            return candidates[new Random().Next(0, candidates.Count)];
    }
}" />
|
<!-- Resolve the URL of the chosen backend and forward the request to it.
     Trailing slashes are trimmed from the configured URL before "/openai" is appended, so a URL
     configured as "https://host/" yields "https://host/openai" rather than "https://host//openai". -->
<set-variable name="backendUrl" value="@{
    JArray backends = (JArray)context.Variables["listBackends"];
    int index = (Int32)context.Variables["backendIndex"];
    string baseUrl = ((JObject)backends[index]).Value<string>("url");

    // A missing url is treated as empty, which keeps the previous result ("/openai") for that case
    return (baseUrl ?? String.Empty).TrimEnd('/') + "/openai";
}" />
<set-backend-service base-url="@((string)context.Variables["backendUrl"])" />
<!-- The request body is buffered; presumably so it can be sent again when the surrounding retry runs another attempt -->
<forward-request buffer-request-body="true" />
|
<choose>
    <!-- On 429 or 5xx: flag the backend that just answered as throttling and recount the usable backends -->
    <when condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500))">
        <!-- Re-read the list from the cache before changing it -->
        <cache-lookup-value key="listBackends" variable-name="listBackends" />
        <set-variable name="listBackends" value="@{
            JArray backends = (JArray)context.Variables["listBackends"];
            int currentBackendIndex = context.Variables.GetValueOrDefault<int>("backendIndex");

            // Cool-down in seconds. The headers are checked in this order and the first one holding a
            // non-negative integer wins; when none does, 10 seconds are used.
            // Int32.TryParse is used on purpose: a header value that is not a plain integer would make
            // Convert.ToInt32 throw, which would abort the failover instead of trying the next backend.
            int retryAfter = 10;
            string[] cooldownHeaders = new string[] { "Retry-After", "x-ratelimit-reset-requests", "x-ratelimit-reset-tokens" };

            foreach (string headerName in cooldownHeaders)
            {
                int parsedSeconds;
                string rawValue = context.Response.Headers.GetValueOrDefault(headerName, "");

                if (Int32.TryParse(rawValue, out parsedSeconds) && parsedSeconds >= 0)
                {
                    retryAfter = parsedSeconds;
                    break;
                }
            }

            JObject backend = (JObject)backends[currentBackendIndex];
            backend["isThrottling"] = true;
            backend["retryAfter"] = DateTime.Now.AddSeconds(retryAfter);

            return backends;
        }" />
        <!-- Publish the updated list to the cache -->
        <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
        <!-- Count the backends that are not throttling; the retry condition stops once this reaches zero -->
        <set-variable name="remainingBackends" value="@{
            JArray backends = (JArray)context.Variables["listBackends"];
            int remainingBackends = 0;

            foreach (JObject backend in backends)
            {
                if (!backend.Value<bool>("isThrottling"))
                {
                    remainingBackends++;
                }
            }

            return remainingBackends;
        }" />
    </when>
</choose>
|
</retry> |
|
</backend> |
|
<outbound>
    <base />
    <!-- Report the backend URL that was used in a response header ("none" when no backendUrl variable exists).
         Remove this header if backend URLs must not be exposed to callers. -->
    <set-header name="x-openai-backendurl" exists-action="override">
        <value>@(context.Variables.GetValueOrDefault<string>("backendUrl", "none"))</value>
    </set-header>
</outbound>
|
<on-error>
    <!-- No custom error handling here: only the inherited (base) on-error policies run -->
    <base />
</on-error>
|
</policies> |