csiebler · August 6, 2024 17:53
diff --git a/instructions.md b/instructions.md
diff --git a/policy-with-time-restriction.xml b/policy-with-time-restriction.xml
 <policies>
    <inbound>
        <base />
        <!-- Load the shared backend list from the cache into the "listBackends" variable -->
        <cache-lookup-value key="listBackends" variable-name="listBackends" />
        <!-- Cache miss: build the default backend list and put it into the cache -->
        <choose>
            <when condition="@(!context.Variables.ContainsKey("listBackends"))">
                <set-variable name="listBackends" value="@{
                    // Properties of a backend entry:
                    //   "url":                Your backend url
                    //   "priority":           Lower value means higher priority over other backends.
                    //                         If you have one or more priority 1 backends, they are always used instead
                    //                         of priority 2 or higher. Backends with higher values are only used when
                    //                         all backends with lower values (top priority) are throttling.
                    //   "isThrottling":       True while the backend is marked as throttling (set after a 429 or 5xx response)
                    //   "retryAfter":         Point in time after which a throttling backend is marked as healthy again
                    //   "hasTimeRestriction": Optional. When true, the backend selection logic additionally applies
                    //                         its time-of-day check to this backend

                    return new JArray
                    {
                        new JObject
                        {
                            ["url"] = "https://azure-openai-ptu.openai.azure.com/",
                            ["priority"] = 1,
                            ["isThrottling"] = false,
                            ["retryAfter"] = DateTime.MinValue,
                            ["hasTimeRestriction"] = true
                        },
                        new JObject
                        {
                            ["url"] = "https://azure-openai-paygo1.openai.azure.com/",
                            ["priority"] = 2,
                            ["isThrottling"] = false,
                            ["retryAfter"] = DateTime.MinValue
                        },
                        new JObject
                        {
                            ["url"] = "https://azure-openai-paygo2.openai.azure.com/",
                            ["priority"] = 2,
                            ["isThrottling"] = false,
                            ["retryAfter"] = DateTime.MinValue
                        }
                    };
                }" />
                <!-- Store the freshly built list in the cache -->
                <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
            </when>
        </choose>
        <!-- Acquire a token for https://cognitiveservices.azure.com via managed identity and send it as the bearer token -->
        <authentication-managed-identity resource="https://cognitiveservices.azure.com" output-token-variable-name="msi-access-token" ignore-error="false" />
        <set-header name="Authorization" exists-action="override">
            <value>@("Bearer " + (string)context.Variables["msi-access-token"])</value>
        </set-header>
        <!-- Per-request state for the retry loop in the backend section. Both values are set through
             expressions so they are stored as Int32: a literal value would be stored as a string, on which
             the (Int32) cast used by the retry condition fails. -->
        <set-variable name="backendIndex" value="@(-1)" />
        <set-variable name="remainingBackends" value="@(1)" />
        <!-- Emit token usage per APIM Subscription, requires Application Insight logging to be enabled for API -->
        <azure-openai-emit-token-metric>
            <dimension name="SubscriptionId" value="@(context.Subscription.Id)" />
        </azure-openai-emit-token-metric>
    </inbound>
    <backend>
        <retry condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) && ((Int32)context.Variables["remainingBackends"]) > 0)" count="50" interval="0">
            <!-- Housekeeping before selection: clear the throttling flag of every backend whose retry time has passed -->
            <set-variable name="listBackends" value="@{
                JArray backendList = (JArray)context.Variables["listBackends"];

                foreach (JObject entry in backendList)
                {
                    bool retryTimeReached = entry.Value<bool>("isThrottling")
                        && DateTime.Now >= entry.Value<DateTime>("retryAfter");

                    if (!retryTimeReached)
                    {
                        continue;
                    }

                    // The back-off period is over: make the backend selectable again
                    entry["isThrottling"] = false;
                    entry["retryAfter"] = DateTime.MinValue;
                }

                return backendList;
            }" />
            <!-- Write the refreshed list back to the cache -->
            <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
            <!-- Backend selection: among the backends that are neither throttling nor skipped by the time check, pick one with the lowest priority value -->
            <set-variable name="backendIndex" value="@{
                // Returns true when the current time in "W. Europe Standard Time" lies OUTSIDE 08:00-22:00.
                // Backends flagged with "hasTimeRestriction" are only selectable while this returns true.
                // NOTE(review): such backends are therefore skipped between 08:00 and 22:00 - confirm that
                // this is the intended window.
                bool IsWithinAllowedHours()
                {
                    TimeSpan windowStart = TimeSpan.FromHours(8);
                    TimeSpan windowEnd = TimeSpan.FromHours(22);
                    TimeZoneInfo zone = TimeZoneInfo.FindSystemTimeZoneById("W. Europe Standard Time");
                    TimeSpan localTimeOfDay = TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, zone).TimeOfDay;
                    return localTimeOfDay < windowStart || localTimeOfDay > windowEnd;
                }

                JArray backendList = (JArray)context.Variables["listBackends"];
                bool timeRestrictedAllowed = IsWithinAllowedHours();

                int bestPriority = Int32.MaxValue;
                List<int> candidates = new List<int>();

                for (int index = 0; index < backendList.Count; index++)
                {
                    JObject entry = (JObject)backendList[index];

                    if (entry.Value<bool>("isThrottling"))
                    {
                        continue;
                    }

                    // A missing "hasTimeRestriction" property counts as "not restricted"
                    bool timeRestricted = entry.Value<bool?>("hasTimeRestriction").GetValueOrDefault();
                    if (timeRestricted && !timeRestrictedAllowed)
                    {
                        continue;
                    }

                    int priority = entry.Value<int>("priority");
                    if (priority > bestPriority)
                    {
                        continue;
                    }

                    if (priority < bestPriority)
                    {
                        // Better priority found: forget the candidates collected so far
                        bestPriority = priority;
                        candidates.Clear();
                    }

                    candidates.Add(index);
                }

                if (candidates.Count == 0)
                {
                    // If there are no available backends, the request will be sent to the first one
                    return 0;
                }

                // A single candidate is used directly; otherwise a random one of the equally ranked backends is picked
                return candidates.Count == 1
                    ? candidates[0]
                    : candidates[new Random().Next(0, candidates.Count)];
            }" />
            <!-- Build the base URL of the selected backend. Trailing slashes of the configured "url" are trimmed so that appending "/openai" cannot produce "//openai". -->
            <set-variable name="backendUrl" value="@(((JObject)((JArray)context.Variables["listBackends"])[(Int32)context.Variables["backendIndex"]]).Value<string>("url").TrimEnd('/') + "/openai")" />
            <set-backend-service base-url="@((string)context.Variables["backendUrl"])" />
            <!-- The request body is buffered so that it can be sent again when the retry policy re-executes this section -->
            <forward-request buffer-request-body="true" />
            <choose>
                <!-- In case we got 429 or 5xx from a backend, mark it as throttling in the backend list -->
                <when condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) )">
                    <!-- Read the list from the cache again before modifying it -->
                    <cache-lookup-value key="listBackends" variable-name="listBackends" />
                    <set-variable name="listBackends" value="@{
                        // Parses a header value as a whole, non-negative number of seconds.
                        // Returns -1 when the header is missing or has another format (for example an
                        // HTTP-date in Retry-After) instead of throwing a FormatException.
                        int ParseSeconds(string rawValue)
                        {
                            int seconds;
                            return int.TryParse(rawValue, out seconds) && seconds >= 0 ? seconds : -1;
                        }

                        JArray backends = (JArray)context.Variables["listBackends"];
                        int currentBackendIndex = context.Variables.GetValueOrDefault<int>("backendIndex");

                        // Header precedence: Retry-After, then x-ratelimit-reset-requests, then x-ratelimit-reset-tokens
                        int retryAfter = ParseSeconds(context.Response.Headers.GetValueOrDefault("Retry-After", ""));

                        if (retryAfter == -1)
                        {
                            retryAfter = ParseSeconds(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-requests", ""));
                        }

                        if (retryAfter == -1)
                        {
                            retryAfter = ParseSeconds(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-tokens", ""));
                        }

                        if (retryAfter == -1)
                        {
                            // No usable header at all: back off for 10 seconds
                            retryAfter = 10;
                        }

                        JObject backend = (JObject)backends[currentBackendIndex];
                        backend["isThrottling"] = true;
                        backend["retryAfter"] = DateTime.Now.AddSeconds(retryAfter);

                        return backends;
                    }" />
                    <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
                    <!-- Count the backends that are not marked as throttling; the retry condition stops retrying once this is 0 -->
                    <set-variable name="remainingBackends" value="@{
                        JArray backends = (JArray)context.Variables["listBackends"];

                        int remainingBackends = 0;

                        foreach (JObject backend in backends)
                        {
                            if (!backend.Value<bool>("isThrottling"))
                            {
                                remainingBackends++;
                            }
                        }

                        return remainingBackends;
                    }" />
                </when>
            </choose>
        </retry>
    </backend>
    <outbound>
        <base />
        <!-- Debug aid: returns the backend URL stored in the "backendUrl" variable as the x-openai-backendurl
             response header ("none" when the variable is not set). Remove it if you don't want to expose this data -->
        <set-header name="x-openai-backendurl" exists-action="override">
            <value>@(context.Variables.GetValueOrDefault<string>("backendUrl", "none"))</value>
        </set-header>
    </outbound>
    <on-error>
        <!-- No custom error handling: only the inherited (base) policies run -->
        <base />
    </on-error>
 </policies>
diff --git a/policy-with-token-tracking.xml b/policy-with-token-tracking.xml
 <policies>
    <inbound>
        <base />
        <!-- Load the shared backend list from the cache into the "listBackends" variable -->
        <cache-lookup-value key="listBackends" variable-name="listBackends" />
        <!-- Cache miss: build the default backend list and put it into the cache -->
        <choose>
            <when condition="@(!context.Variables.ContainsKey("listBackends"))">
                <set-variable name="listBackends" value="@{
                    // Properties of a backend entry:
                    //   "url":          Your backend url
                    //   "priority":     Lower value means higher priority over other backends.
                    //                   If you have one or more priority 1 backends, they are always used instead
                    //                   of priority 2 or higher. Backends with higher values are only used when
                    //                   all backends with lower values (top priority) are throttling.
                    //   "isThrottling": True while the backend is marked as throttling (set after a 429 or 5xx response)
                    //   "retryAfter":   Point in time after which a throttling backend is marked as healthy again

                    return new JArray
                    {
                        new JObject
                        {
                            ["url"] = "https://azure-openai-ptu.openai.azure.com/",
                            ["priority"] = 1,
                            ["isThrottling"] = false,
                            ["retryAfter"] = DateTime.MinValue
                        },
                        new JObject
                        {
                            ["url"] = "https://azure-openai-paygo1.openai.azure.com/",
                            ["priority"] = 2,
                            ["isThrottling"] = false,
                            ["retryAfter"] = DateTime.MinValue
                        },
                        new JObject
                        {
                            ["url"] = "https://azure-openai-paygo2.openai.azure.com/",
                            ["priority"] = 2,
                            ["isThrottling"] = false,
                            ["retryAfter"] = DateTime.MinValue
                        }
                    };
                }" />
                <!-- Store the freshly built list in the cache -->
                <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
            </when>
        </choose>
        <!-- Acquire a token for https://cognitiveservices.azure.com via managed identity and send it as the bearer token -->
        <authentication-managed-identity resource="https://cognitiveservices.azure.com" output-token-variable-name="msi-access-token" ignore-error="false" />
        <set-header name="Authorization" exists-action="override">
            <value>@("Bearer " + (string)context.Variables["msi-access-token"])</value>
        </set-header>
        <!-- Per-request state for the retry loop in the backend section. Both values are set through
             expressions so they are stored as Int32: a literal value would be stored as a string, on which
             the (Int32) cast used by the retry condition fails. -->
        <set-variable name="backendIndex" value="@(-1)" />
        <set-variable name="remainingBackends" value="@(1)" />
        <!-- Emit token usage per APIM Subscription, requires Application Insight logging to be enabled for API -->
        <azure-openai-emit-token-metric>
            <dimension name="SubscriptionId" value="@(context.Subscription.Id)" />
        </azure-openai-emit-token-metric>
    </inbound>
    <backend>
        <retry condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) && ((Int32)context.Variables["remainingBackends"]) > 0)" count="50" interval="0">
            <!-- Housekeeping before selection: clear the throttling flag of every backend whose retry time has passed -->
            <set-variable name="listBackends" value="@{
                JArray backendList = (JArray)context.Variables["listBackends"];

                foreach (JObject entry in backendList)
                {
                    bool retryTimeReached = entry.Value<bool>("isThrottling")
                        && DateTime.Now >= entry.Value<DateTime>("retryAfter");

                    if (!retryTimeReached)
                    {
                        continue;
                    }

                    // The back-off period is over: make the backend selectable again
                    entry["isThrottling"] = false;
                    entry["retryAfter"] = DateTime.MinValue;
                }

                return backendList;
            }" />
            <!-- Write the refreshed list back to the cache -->
            <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
            <!-- Backend selection: among the backends that are not throttling, pick one with the lowest priority value -->
            <set-variable name="backendIndex" value="@{
                JArray backendList = (JArray)context.Variables["listBackends"];

                int bestPriority = Int32.MaxValue;
                List<int> candidates = new List<int>();

                for (int index = 0; index < backendList.Count; index++)
                {
                    JObject entry = (JObject)backendList[index];

                    if (entry.Value<bool>("isThrottling"))
                    {
                        continue;
                    }

                    int priority = entry.Value<int>("priority");
                    if (priority > bestPriority)
                    {
                        continue;
                    }

                    if (priority < bestPriority)
                    {
                        // Better priority found: forget the candidates collected so far
                        bestPriority = priority;
                        candidates.Clear();
                    }

                    candidates.Add(index);
                }

                if (candidates.Count == 0)
                {
                    // If there are no available backends, the request will be sent to the first one
                    return 0;
                }

                // A single candidate is used directly; otherwise a random one of the equally ranked backends is picked
                return candidates.Count == 1
                    ? candidates[0]
                    : candidates[new Random().Next(0, candidates.Count)];
            }" />
            <!-- Build the base URL of the selected backend. Trailing slashes of the configured "url" are trimmed so that appending "/openai" cannot produce "//openai". -->
            <set-variable name="backendUrl" value="@(((JObject)((JArray)context.Variables["listBackends"])[(Int32)context.Variables["backendIndex"]]).Value<string>("url").TrimEnd('/') + "/openai")" />
            <set-backend-service base-url="@((string)context.Variables["backendUrl"])" />
            <!-- The request body is buffered so that it can be sent again when the retry policy re-executes this section -->
            <forward-request buffer-request-body="true" />
            <choose>
                <!-- In case we got 429 or 5xx from a backend, mark it as throttling in the backend list -->
                <when condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) )">
                    <!-- Read the list from the cache again before modifying it -->
                    <cache-lookup-value key="listBackends" variable-name="listBackends" />
                    <set-variable name="listBackends" value="@{
                        // Parses a header value as a whole, non-negative number of seconds.
                        // Returns -1 when the header is missing or has another format (for example an
                        // HTTP-date in Retry-After) instead of throwing a FormatException.
                        int ParseSeconds(string rawValue)
                        {
                            int seconds;
                            return int.TryParse(rawValue, out seconds) && seconds >= 0 ? seconds : -1;
                        }

                        JArray backends = (JArray)context.Variables["listBackends"];
                        int currentBackendIndex = context.Variables.GetValueOrDefault<int>("backendIndex");

                        // Header precedence: Retry-After, then x-ratelimit-reset-requests, then x-ratelimit-reset-tokens
                        int retryAfter = ParseSeconds(context.Response.Headers.GetValueOrDefault("Retry-After", ""));

                        if (retryAfter == -1)
                        {
                            retryAfter = ParseSeconds(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-requests", ""));
                        }

                        if (retryAfter == -1)
                        {
                            retryAfter = ParseSeconds(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-tokens", ""));
                        }

                        if (retryAfter == -1)
                        {
                            // No usable header at all: back off for 10 seconds
                            retryAfter = 10;
                        }

                        JObject backend = (JObject)backends[currentBackendIndex];
                        backend["isThrottling"] = true;
                        backend["retryAfter"] = DateTime.Now.AddSeconds(retryAfter);

                        return backends;
                    }" />
                    <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
                    <!-- Count the backends that are not marked as throttling; the retry condition stops retrying once this is 0 -->
                    <set-variable name="remainingBackends" value="@{
                        JArray backends = (JArray)context.Variables["listBackends"];

                        int remainingBackends = 0;

                        foreach (JObject backend in backends)
                        {
                            if (!backend.Value<bool>("isThrottling"))
                            {
                                remainingBackends++;
                            }
                        }

                        return remainingBackends;
                    }" />
                </when>
            </choose>
        </retry>
    </backend>
    <outbound>
        <base />
        <!-- Debug aid: returns the backend URL stored in the "backendUrl" variable as the x-openai-backendurl
             response header ("none" when the variable is not set). Remove it if you don't want to expose this data -->
        <set-header name="x-openai-backendurl" exists-action="override">
            <value>@(context.Variables.GetValueOrDefault<string>("backendUrl", "none"))</value>
        </set-header>
    </outbound>
    <on-error>
        <!-- No custom error handling: only the inherited (base) policies run -->
        <base />
    </on-error>
 </policies>
	<policies>
	<inbound>
	<base />
	<!-- Load the shared backend list from the cache into the "listBackends" variable -->
	<cache-lookup-value key="listBackends" variable-name="listBackends" />
	<!-- Cache miss: build the default backend list and put it into the cache -->
	<choose>
	    <when condition="@(!context.Variables.ContainsKey("listBackends"))">
	        <set-variable name="listBackends" value="@{
	            // Properties of a backend entry:
	            //   "url":                Your backend url
	            //   "priority":           Lower value means higher priority over other backends.
	            //                         If you have one or more priority 1 backends, they are always used instead
	            //                         of priority 2 or higher. Backends with higher values are only used when
	            //                         all backends with lower values (top priority) are throttling.
	            //   "isThrottling":       True while the backend is marked as throttling (set after a 429 or 5xx response)
	            //   "retryAfter":         Point in time after which a throttling backend is marked as healthy again
	            //   "hasTimeRestriction": Optional. When true, the backend selection logic additionally applies
	            //                         its time-of-day check to this backend

	            return new JArray
	            {
	                new JObject
	                {
	                    ["url"] = "https://azure-openai-ptu.openai.azure.com/",
	                    ["priority"] = 1,
	                    ["isThrottling"] = false,
	                    ["retryAfter"] = DateTime.MinValue,
	                    ["hasTimeRestriction"] = true
	                },
	                new JObject
	                {
	                    ["url"] = "https://azure-openai-paygo1.openai.azure.com/",
	                    ["priority"] = 2,
	                    ["isThrottling"] = false,
	                    ["retryAfter"] = DateTime.MinValue
	                },
	                new JObject
	                {
	                    ["url"] = "https://azure-openai-paygo2.openai.azure.com/",
	                    ["priority"] = 2,
	                    ["isThrottling"] = false,
	                    ["retryAfter"] = DateTime.MinValue
	                }
	            };
	        }" />
	        <!-- Store the freshly built list in the cache -->
	        <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
	    </when>
	</choose>
	<!-- Acquire a token for https://cognitiveservices.azure.com via managed identity and send it as the bearer token -->
	<authentication-managed-identity resource="https://cognitiveservices.azure.com" output-token-variable-name="msi-access-token" ignore-error="false" />
	<set-header name="Authorization" exists-action="override">
	    <value>@("Bearer " + (string)context.Variables["msi-access-token"])</value>
	</set-header>
	<!-- Per-request state for the retry loop in the backend section. Both values are set through
	     expressions so they are stored as Int32: a literal value would be stored as a string, on which
	     the (Int32) cast used by the retry condition fails. -->
	<set-variable name="backendIndex" value="@(-1)" />
	<set-variable name="remainingBackends" value="@(1)" />
	<!-- Emit token usage per APIM Subscription, requires Application Insight logging to be enabled for API -->
	<azure-openai-emit-token-metric>
	    <dimension name="SubscriptionId" value="@(context.Subscription.Id)" />
	</azure-openai-emit-token-metric>
	</inbound>
	<backend>
	    <!-- Retry while the last response was a 429 or 5xx and at least one backend is still not marked as throttling.
	         The logical OR must be written as plain "||"; the markdown-escaped form "\|\|" is not valid C#. -->
	    <retry condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) && ((Int32)context.Variables["remainingBackends"]) > 0)" count="50" interval="0">
	        <!-- Housekeeping before selection: clear the throttling flag of every backend whose retry time has passed -->
	        <set-variable name="listBackends" value="@{
	            JArray backendList = (JArray)context.Variables["listBackends"];

	            foreach (JObject entry in backendList)
	            {
	                bool retryTimeReached = entry.Value<bool>("isThrottling")
	                    && DateTime.Now >= entry.Value<DateTime>("retryAfter");

	                if (!retryTimeReached)
	                {
	                    continue;
	                }

	                // The back-off period is over: make the backend selectable again
	                entry["isThrottling"] = false;
	                entry["retryAfter"] = DateTime.MinValue;
	            }

	            return backendList;
	        }" />
	        <!-- Write the refreshed list back to the cache -->
	        <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
	        <!-- Backend selection: among the backends that are neither throttling nor skipped by the time check, pick one with the lowest priority value -->
	        <set-variable name="backendIndex" value="@{
	            // Returns true when the current time in "W. Europe Standard Time" lies OUTSIDE 08:00-22:00.
	            // Backends flagged with "hasTimeRestriction" are only selectable while this returns true.
	            // NOTE(review): such backends are therefore skipped between 08:00 and 22:00 - confirm that
	            // this is the intended window.
	            bool IsWithinAllowedHours()
	            {
	                TimeZoneInfo timeZoneInfo = TimeZoneInfo.FindSystemTimeZoneById("W. Europe Standard Time");
	                DateTime currentTime = TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, timeZoneInfo);
	                TimeSpan startTime = new TimeSpan(8, 0, 0);
	                TimeSpan endTime = new TimeSpan(22, 0, 0);
	                return !(currentTime.TimeOfDay >= startTime && currentTime.TimeOfDay <= endTime);
	            }

	            JArray backends = (JArray)context.Variables["listBackends"];
	            bool isWithinAllowedHours = IsWithinAllowedHours();

	            int selectedPriority = Int32.MaxValue;
	            List<int> availableBackends = new List<int>();

	            for (int i = 0; i < backends.Count; i++)
	            {
	                JObject backend = (JObject)backends[i];

	                if (backend.Value<bool>("isThrottling"))
	                {
	                    continue;
	                }

	                // A missing "hasTimeRestriction" property counts as "not restricted"
	                bool hasTimeRestriction = backend.Value<bool?>("hasTimeRestriction") ?? false;

	                if (hasTimeRestriction && !isWithinAllowedHours)
	                {
	                    continue; // Skip this backend if it's restricted by time and the time check does not allow it
	                }

	                int backendPriority = backend.Value<int>("priority");

	                if (backendPriority < selectedPriority)
	                {
	                    // Better priority found: forget the candidates collected so far
	                    selectedPriority = backendPriority;
	                    availableBackends.Clear();
	                    availableBackends.Add(i);
	                }
	                else if (backendPriority == selectedPriority)
	                {
	                    availableBackends.Add(i);
	                }
	            }

	            if (availableBackends.Count == 1)
	            {
	                return availableBackends[0];
	            }

	            if (availableBackends.Count > 0)
	            {
	                // Returns a random backend from the list if we have more than one available with the same priority
	                return availableBackends[new Random().Next(0, availableBackends.Count)];
	            }

	            // If there are no available backends, the request will be sent to the first one
	            return 0;
	        }" />
	        <!-- Build the base URL of the selected backend. Trailing slashes of the configured "url" are trimmed so that appending "/openai" cannot produce "//openai". -->
	        <set-variable name="backendUrl" value="@(((JObject)((JArray)context.Variables["listBackends"])[(Int32)context.Variables["backendIndex"]]).Value<string>("url").TrimEnd('/') + "/openai")" />
	        <set-backend-service base-url="@((string)context.Variables["backendUrl"])" />
	        <!-- The request body is buffered so that it can be sent again when the retry policy re-executes this section -->
	        <forward-request buffer-request-body="true" />
	        <choose>
	            <!-- In case we got 429 or 5xx from a backend, mark it as throttling in the backend list -->
	            <when condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) )">
	                <!-- Read the list from the cache again before modifying it -->
	                <cache-lookup-value key="listBackends" variable-name="listBackends" />
	                <set-variable name="listBackends" value="@{
	                    // Parses a header value as a whole, non-negative number of seconds.
	                    // Returns -1 when the header is missing or has another format (for example an
	                    // HTTP-date in Retry-After) instead of throwing a FormatException.
	                    int ParseSeconds(string rawValue)
	                    {
	                        int seconds;
	                        return int.TryParse(rawValue, out seconds) && seconds >= 0 ? seconds : -1;
	                    }

	                    JArray backends = (JArray)context.Variables["listBackends"];
	                    int currentBackendIndex = context.Variables.GetValueOrDefault<int>("backendIndex");

	                    // Header precedence: Retry-After, then x-ratelimit-reset-requests, then x-ratelimit-reset-tokens
	                    int retryAfter = ParseSeconds(context.Response.Headers.GetValueOrDefault("Retry-After", ""));

	                    if (retryAfter == -1)
	                    {
	                        retryAfter = ParseSeconds(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-requests", ""));
	                    }

	                    if (retryAfter == -1)
	                    {
	                        retryAfter = ParseSeconds(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-tokens", ""));
	                    }

	                    if (retryAfter == -1)
	                    {
	                        // No usable header at all: back off for 10 seconds
	                        retryAfter = 10;
	                    }

	                    JObject backend = (JObject)backends[currentBackendIndex];
	                    backend["isThrottling"] = true;
	                    backend["retryAfter"] = DateTime.Now.AddSeconds(retryAfter);

	                    return backends;
	                }" />
	                <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
	                <!-- Count the backends that are not marked as throttling; the retry condition stops retrying once this is 0 -->
	                <set-variable name="remainingBackends" value="@{
	                    JArray backends = (JArray)context.Variables["listBackends"];

	                    int remainingBackends = 0;

	                    foreach (JObject backend in backends)
	                    {
	                        if (!backend.Value<bool>("isThrottling"))
	                        {
	                            remainingBackends++;
	                        }
	                    }

	                    return remainingBackends;
	                }" />
	            </when>
	        </choose>
	    </retry>
	</backend>
	<outbound>
	<base />
	<!-- Debug aid: returns the backend URL stored in the "backendUrl" variable as the x-openai-backendurl
	     response header ("none" when the variable is not set). Remove it if you don't want to expose this data -->
	<set-header name="x-openai-backendurl" exists-action="override">
	<value>@(context.Variables.GetValueOrDefault<string>("backendUrl", "none"))</value>
	</set-header>
	</outbound>
	<on-error>
	<!-- No custom error handling: only the inherited (base) policies run -->
	<base />
	</on-error>
	</policies>