Last active
September 25, 2020 21:13
-
-
Save pkhuong/68365b3f52a15f5cb652344b0f35cfe9 to your computer and use it in GitHub Desktop.
Numerical implementation of Beyer et al's confidence intervals for k min values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Numerical implementation of Beyer et al's confidence intervals for k min values | |
[On Synopses for Distinct-Value Estimation Under Multiset Operations](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.230.7008&rep=rep1&type=pdf#page=6), Section 4.3. | |
ci_probability(k, eps) computes the probability that the unbiased | |
cardinality estimate | |
\hat{D}_k^{UB} = (k - 1) / U(k), | |
where k is the number of minimum values we keep, and U(k) is the | |
maximum hash value in that list, normalised to [0, 1]. If we saw | |
fewer than k distinct values, we can simply report the exact value. | |
Let D be the actual cardinality. If ci_probability(k, eps) = delta, | |
we know that | |
P(|\hat{D}_k^{UB} - D| / D <= eps) >= delta, | |
for any number of values and cardinality D: the asymptotic bound | |
derived in the reference paper is pessimistic, but still correct, | |
for finite cardinalities and value counts. | |
We can also invert that function and instead find eps, given a fixed | |
sketch size k and coverage probability delta. That's what | |
`eps_search` implements. | |
ci_probability was validated by comparing with Figure 1 in Beyer et | |
al's paper, and with the example "delta = 0.95, k = 1024, eps ~= | |
0.06127." | |
""" | |
from mpmath import mp, mpf | |
def ibterm(k, pm_eps, log_scale):
    """Computes

    exp(log_scale) [ 1 + \sum_{i = 1}^{k - 1} (k - 1)^i / ((1 + pm_eps)^i i!) ],

    when pm_eps is +eps or -eps.

    This is a product of a very small (exp(log_scale)) value and a large
    sum.  In order to work in log space, we distribute to instead sum
    better behaved products that can be evaluated in log space:

    exp(log_scale)
      + \sum_{i=1}^{k - 1} exp(log_scale + i log((k - 1) / (1 + pm_eps)) - log i!)

    Every component of the inner exponent is trivial to compute in log
    space; we simply update the running log-factorial for each
    iteration of the sum.
    """
    pm_eps = mpf(pm_eps)
    log_scale = mpf(log_scale)
    # log((k - 1) / (1 + pm_eps)) is loop-invariant; hoist it out of the sum.
    log_ratio = mp.log(k - 1) - mp.log(1 + pm_eps)
    log_fac = mpf(0)  # Accumulator for log(i!).
    acc = mp.exp(log_scale)  # The i = 0 term.
    for i in range(1, k):
        log_fac += mp.log(i)
        acc += mp.exp(log_scale + i * log_ratio - log_fac)
    return acc
def ci_probability(k, eps):
    """Computes the probability that a k min value sketch has a relative
    error of eps or less (see S 4.3 of
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.230.7008&rep=rep1&type=pdf#page=6)."""
    eps = mpf(eps)
    km1 = k - 1
    # Coverage probability is the difference of the two incomplete-beta
    # style terms, evaluated at +eps and -eps.
    upper = ibterm(k, eps, -km1 / (1 + eps))
    lower = ibterm(k, -eps, -km1 / (1 - eps))
    return upper - lower
def eps_search(k, p, min_eps=1e-5, max_eps=1 - 1e-5):
    """Binary searches for the minimum eps such that a k min value sketch
    has a relative error eps or less with probability at least p."""
    lo, hi = min_eps, max_eps
    # 64 bisection steps more than exhaust double precision on (lo, hi).
    for _ in range(64):
        mid = 0.5 * (lo + hi)
        if ci_probability(k, mid) < p:
            # Coverage at mid is too low: the CI is too aggressive,
            # so the answer lies in the wider half.
            lo = mid
        else:
            hi = mid
    # Return the conservative (upper) end of the bracket.
    return hi
# The tail probabilities involved (e.g., delta = 1 - 1e-20) are far below
# double precision, so let mpmath carry 200 decimal digits of slack.
mp.dps = 200

# Sketch sizes of interest: powers of two plus round hundreds and
# thousands, deduplicated via set union (no redundant list() before
# sorting -- sorted() accepts any iterable).
sizes = sorted(
    {2 ** i for i in range(8, 16)}
    | {100 * i for i in range(3, 10)}
    | {1000 * i for i in range(1, 21)}
)
for k in sizes:
    print((k, eps_search(k, 1 - mpf(1e-20))))
# In [40]: for k in sizes:
#     ...:     print((k, eps_search(k, 1 - mpf(1e-20))))
#     ...:
# (256, 0.8933010437798387)
# (300, 0.795063354117913)
# (400, 0.649231171891936)
# (500, 0.5584635478511522)
# (512, 0.5497594311001835)
# (600, 0.49560051748505624)
# (700, 0.44901120757738205)
# (800, 0.4128276319843038)
# (900, 0.3837456093057503)
# (1000, 0.3597514885971114)
# (1024, 0.35459489223872503)
# (2000, 0.23886654949510666)
# (2048, 0.23563644019592284)
# (3000, 0.18981268452153102)
# (4000, 0.16177921662153935)
# (4096, 0.15967878445409936)
# (5000, 0.1431446307946877)
# (6000, 0.12964041665307863)
# (7000, 0.11928942625624793)
# (8000, 0.11103632459490151)
# (8192, 0.10963525526984166)
# (9000, 0.10426065031951647)
# (10000, 0.09857100245899354)
# (11000, 0.09370684955379038)
# (12000, 0.08948726223117043)
# (13000, 0.08578220293734226)
# (14000, 0.0824955198497288)
# (15000, 0.07955439905541617)
# (16000, 0.07690256574667127)
# (16384, 0.07595147059050668)
# (17000, 0.07449575639286388)
# (18000, 0.07229861740425302)
# (19000, 0.07028252833983045)
# (20000, 0.068424040928028)
# (32768, 0.052942429149847224)
# In [41]: for k in sizes:
#     ...:     print((k, eps_search(k, 1 - mpf(2**-70))))
#     ...:
# (256, 0.9314607985943795)
# (300, 0.8279129506378315)
# (400, 0.6746537447821473)
# (500, 0.5795522484509731)
# (512, 0.5704446214577652)
# (600, 0.5138236681150362)
# (700, 0.4651847660519782)
# (800, 0.42745398481359465)
# (900, 0.397157233270994)
# (1000, 0.3721805371190448)
# (1024, 0.3668151163871019)
# (2000, 0.24662395085228275)
# (2048, 0.2432755689900746)
# (3000, 0.19581191703669742)
# (4000, 0.16681072702452937)
# (4096, 0.16463888967344198)
# (5000, 0.14754806648706512)
# (6000, 0.1335963734526734)
# (7000, 0.12290679695227132)
# (8000, 0.1143864907519984)
# (8192, 0.11294030406681277)
# (9000, 0.1073932845831681)
# (10000, 0.10152225768879311)
# (11000, 0.09650397542794534)
# (12000, 0.09215138190015283)
# (13000, 0.08833007115386277)
# (14000, 0.08494068379081839)
# (15000, 0.08190799189738561)
# (16000, 0.07917386534949619)
# (16384, 0.07819331779778638)
# (17000, 0.07669258863232875)
# (18000, 0.07442765375699807)
# (19000, 0.07234950966446299)
# (20000, 0.07043394859402075)
# (32768, 0.05448167268478741)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment