Created
July 13, 2019 20:25
-
-
Save emrul/42c6baf23eb16791c48e7226e616d116 to your computer and use it in GitHub Desktop.
C#/.NET implementation of P2 (P-Squared) algorithm for calculating percentiles on streaming data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
/* | |
* References: | |
* Initial idea: https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf | |
* Based on implementation: https://github.com/skycmoon/P2 | |
* | |
* Author: | |
* Emrul Islam <emrul@emrul.com> | |
* | |
* Usage: | |
* Either initialise with a specified quantile: | |
* P2Engine p2Engine = new P2Engine(0.5); | |
* or a list of quantiles: | |
* P2Engine p2Engine = new P2Engine(new[] {0.25, 0.5, 0.75}); | |
* | |
* Then for each data-point call the add method. E.g.: | |
* int SAMPLE_SIZE = 1000000; | |
* Random rnd = new Random(78651); | |
* double[] values = Enumerable.Repeat(0, SAMPLE_SIZE).Select(i=>rnd.NextDouble()).ToArray(); | |
* foreach (double value in values) | |
* { | |
* p2Engine.add(value); | |
* } | |
* | |
* If you passed a single double to the constructor you can call result(): | |
* p2Engine.result(); | |
* // 0.499828816523464 | |
* Or if you passed a list of doubles you must call result with a specific quantile: | |
* p2Engine.result(0.25); | |
* // 0.249839636176194 | |
*/ | |
public class P2Engine | |
{ | |
private double[] q; | |
private double[] dn; | |
private double[] np; | |
private int[] n; | |
private int count = 0; | |
private int marker_count; | |
public P2Engine(double quantile) : this(new[] {quantile}) | |
{ | |
} | |
public P2Engine(ICollection<double> quantiles) | |
{ | |
count = 0; | |
// We need len(quantiles) * 3 + 2 markers | |
// One marker for start, one for end, and each quantile we're interested in needs 3 markers. | |
marker_count = quantiles.Count * 3 + 2; | |
q = new double[marker_count]; | |
dn = new double[marker_count]; | |
np = new double[marker_count]; | |
n = new int[marker_count]; | |
dn[0] = 0; // First Marker is always 0 | |
dn[marker_count-1] = 1; // Last marker is always 1 | |
int pointer = 1; | |
// Add each quantile | |
foreach (double quantile in quantiles) | |
{ | |
dn[pointer] = quantile; | |
dn[pointer + 1] = quantile / 2; | |
dn[pointer + 2] = (1 + quantile) / 2; | |
pointer += 3; | |
} | |
// Sort the markers | |
Array.Sort(dn); | |
for (int i = 0; i < marker_count; i++) | |
{ | |
np[i] = (marker_count - 1) * dn[i] + 1; | |
} | |
} | |
private int sign(double d) | |
{ | |
if (d >= 0) | |
{ | |
return 1; | |
} | |
return -1; | |
} | |
private double parabolic(int i, int d) | |
{ | |
return q[i] + d / ((n[i + 1] - n[i - 1]) * | |
((n[i] - n[i - 1] + d) * ((q[i + 1] - q[i]) / (n[i + 1] - n[i])) + | |
(n[i + 1] - (n[i] - d)) * ((q[i] - q[i - 1]) / (n[i] - n[i - 1])))); | |
} | |
private double linear(int i, int d) | |
{ | |
return q[i] + d * ((q[i + d] - q[i]) / (n[i + d] - n[i])); | |
} | |
public void add(double data) | |
{ | |
int i; | |
int k = 0; | |
if (count >= marker_count) | |
{ | |
count++; | |
// B1 | |
if (data < q[0]) | |
{ | |
q[0] = data; | |
k = 1; | |
} | |
else if (data >= q[marker_count - 1]) | |
{ | |
q[marker_count - 1] = data; | |
k = marker_count - 1; | |
} | |
else | |
{ | |
for (i = 1; i < marker_count; i++) | |
{ | |
if (!(data < q[i])) continue; | |
k = i; | |
break; | |
} | |
} | |
// B2 | |
for (i = k; i < marker_count; i++) | |
{ | |
n[i]++; | |
np[i] = np[i] + dn[i]; | |
} | |
for (i = 0; i < k; i++) | |
{ | |
np[i] = np[i] + dn[i]; | |
} | |
// B3 | |
for (i = 1; i < marker_count - 1; i++) | |
{ | |
var d = np[i] - n[i]; | |
if ((!(d >= 1) || n[i + 1] - n[i] <= 1) && (!(d <= -1) || n[i - 1] - n[i] >= -1)) continue; | |
var newq = parabolic(i, sign(d)); | |
if (q[i - 1] < newq && newq < q[i + 1]) | |
{ | |
q[i] = newq; | |
} | |
else | |
{ | |
q[i] = linear(i, sign(d)); | |
} | |
n[i] = n[i] + sign(d); | |
} | |
} | |
else | |
{ | |
q[count] = data; | |
count++; | |
if (count != marker_count) return; | |
// We have enough to start the algorithm, initialize | |
Array.Sort(q); | |
//bubbleSort(q, marker_count); | |
for (i = 0; i < marker_count; i++) | |
{ | |
n[i] = i + 1; | |
} | |
} | |
} | |
public double result() | |
{ | |
if (marker_count != 5) | |
{ | |
// result() NoArg method will only work as long as only 1 quantile was specified during class instantiation. | |
throw new Exception("result() NoArg method will only work as long as only 1 quantile was specified during class instantiation."); | |
} | |
return result(dn[(marker_count - 1) / 2]); | |
} | |
// ReSharper disable once MemberCanBePrivate.Global | |
public double result(double quantile) | |
{ | |
if (count < marker_count) | |
{ | |
int closest = 1; | |
Array.Sort(q); | |
//bubbleSort(q, count); | |
for (int i = 2; i < count; i++) | |
{ | |
if (Math.Abs((double) i / count - quantile) < Math.Abs((double) closest / marker_count - quantile)) | |
{ | |
closest = i; | |
} | |
} | |
return q[closest]; | |
} | |
else | |
{ | |
// Figure out which quantile is the one we're looking for by nearest dn | |
int closest = 1; | |
for (int i = 2; i < marker_count - 1; i++) | |
{ | |
if (Math.Abs(dn[i] - quantile) < Math.Abs(dn[closest] - quantile)) | |
{ | |
closest = i; | |
} | |
} | |
return q[closest]; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment