Last active
September 25, 2020 21:13
-
-
Save pkhuong/68365b3f52a15f5cb652344b0f35cfe9 to your computer and use it in GitHub Desktop.
Numerical implementation of Beyer et al's confidence intervals for k min values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Numerical implementation of Beyer et al's confidence intervals for k min values | |
[On Synopses for Distinct-Value Estimation Under Multiset Operations](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.230.7008&rep=rep1&type=pdf#page=6), Section 4.3. | |
ci_probability(k, eps) computes the probability that the unbiased | |
cardinality estimate | |
\hat{D}_k^{UB} = (k - 1) / U(k), | |
where k is the number of minimum values we keep, and U(k) is the | |
maximum hash value in that list, normalised to [0, 1]. If we saw | |
fewer than k distinct values, we can simply report the exact value. | |
Let D be the actual cardinality. If ci_probability(k, eps) = delta, | |
we know that | |
P(|\hat{D}_k^{UB} - D| / D <= eps) >= delta, | |
for any number of values and cardinality D: the asymptotic bound | |
derived in the reference paper is pessimistic, but still correct, | |
for finite cardinalities and value counts. | |
We can also invert that function and instead find eps, given a fixed | |
sketch size k and coverage probability delta. That's what | |
`eps_search` implements. | |
ci_probability was validated by comparing with Figure 1 in Beyer et | |
al's paper, and with the example "delta = 0.95, k = 1024, eps ~= | |
0.06127." | |
""" | |
from mpmath import mp, mpf | |
def ibterm(k, pm_eps, log_scale):
    """Computes

    exp(log_scale) [ 1 + \sum_{i = 1}^{k - 1} (k - 1)^i / ((1 + pm_eps)^i i!) ],

    when pm_eps is +eps or -eps.

    This is a product of a very small (exp(log_scale)) value and a large
    sum.  In order to work in log space, we distribute to instead sum
    better behaved products that can be evaluated in log space:

    exp(log_scale)
      + \sum_{i=1}^{k - 1} exp(log_scale + i log((k - 1) / (1 + pm_eps)) - log i!)

    Every component of the inner exponent is trivial to compute in log
    space; we simply update the running log-factorial for each
    iteration of the sum.
    """
    pm_eps = mpf(pm_eps)
    log_scale = mpf(log_scale)
    # log((k - 1) / (1 + pm_eps)) is loop-invariant; hoist it out of the sum.
    log_ratio = mp.log(k - 1) - mp.log(1 + pm_eps)
    log_fac = mpf(0)  # Accumulator for log(i!).
    acc = mp.exp(log_scale)  # The i = 0 term.
    for i in range(1, k):
        log_fac += mp.log(i)
        acc += mp.exp(log_scale + i * log_ratio - log_fac)
    return acc
def ci_probability(k, eps):
    """Computes the probability that a k min value sketch has a relative
    error of eps or less (see S 4.3 of
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.230.7008&rep=rep1&type=pdf#page=6)."""
    eps = mpf(eps)
    km1 = k - 1
    # Coverage probability is the difference of the two incomplete-beta
    # style terms, evaluated at +eps and -eps.
    upper = ibterm(k, eps, -km1 / (1 + eps))
    lower = ibterm(k, -eps, -km1 / (1 - eps))
    return upper - lower
def eps_search(k, p, min_eps=1e-5, max_eps=1 - 1e-5):
    """Binary searches for the minimum eps such that a k min value sketch
    has a relative error eps or less with probability at least p."""
    lo, hi = min_eps, max_eps
    # 64 bisection steps more than exhaust double precision on (lo, hi).
    for _ in range(64):
        mid = 0.5 * (lo + hi)
        if ci_probability(k, mid) < p:
            # Coverage at mid is too low: the CI is too aggressive,
            # so the answer lies in the wider half.
            lo = mid
        else:
            hi = mid
    # Return the conservative (upper) end of the bracket.
    return hi
# The tail probabilities involved (e.g., delta = 1 - 1e-20) are far below
# double precision, so let mpmath carry 200 decimal digits of slack.
mp.dps = 200

# Sketch sizes of interest: powers of two plus round hundreds and
# thousands, deduplicated via set union (no redundant list() before
# sorting -- sorted() accepts any iterable).
sizes = sorted(
    {2 ** i for i in range(8, 16)}
    | {100 * i for i in range(3, 10)}
    | {1000 * i for i in range(1, 21)}
)
for k in sizes:
    print((k, eps_search(k, 1 - mpf(1e-20))))
# In [40]: for k in sizes:
#     ...:     print((k, eps_search(k, 1 - mpf(1e-20))))
#     ...:
# (256, 0.8933010437798387)
# (300, 0.795063354117913)
# (400, 0.649231171891936)
# (500, 0.5584635478511522)
# (512, 0.5497594311001835)
# (600, 0.49560051748505624)
# (700, 0.44901120757738205)
# (800, 0.4128276319843038)
# (900, 0.3837456093057503)
# (1000, 0.3597514885971114)
# (1024, 0.35459489223872503)
# (2000, 0.23886654949510666)
# (2048, 0.23563644019592284)
# (3000, 0.18981268452153102)
# (4000, 0.16177921662153935)
# (4096, 0.15967878445409936)
# (5000, 0.1431446307946877)
# (6000, 0.12964041665307863)
# (7000, 0.11928942625624793)
# (8000, 0.11103632459490151)
# (8192, 0.10963525526984166)
# (9000, 0.10426065031951647)
# (10000, 0.09857100245899354)
# (11000, 0.09370684955379038)
# (12000, 0.08948726223117043)
# (13000, 0.08578220293734226)
# (14000, 0.0824955198497288)
# (15000, 0.07955439905541617)
# (16000, 0.07690256574667127)
# (16384, 0.07595147059050668)
# (17000, 0.07449575639286388)
# (18000, 0.07229861740425302)
# (19000, 0.07028252833983045)
# (20000, 0.068424040928028)
# (32768, 0.052942429149847224)
# In [41]: for k in sizes:
#     ...:     print((k, eps_search(k, 1 - mpf(2**-70))))
#     ...:
# (256, 0.9314607985943795)
# (300, 0.8279129506378315)
# (400, 0.6746537447821473)
# (500, 0.5795522484509731)
# (512, 0.5704446214577652)
# (600, 0.5138236681150362)
# (700, 0.4651847660519782)
# (800, 0.42745398481359465)
# (900, 0.397157233270994)
# (1000, 0.3721805371190448)
# (1024, 0.3668151163871019)
# (2000, 0.24662395085228275)
# (2048, 0.2432755689900746)
# (3000, 0.19581191703669742)
# (4000, 0.16681072702452937)
# (4096, 0.16463888967344198)
# (5000, 0.14754806648706512)
# (6000, 0.1335963734526734)
# (7000, 0.12290679695227132)
# (8000, 0.1143864907519984)
# (8192, 0.11294030406681277)
# (9000, 0.1073932845831681)
# (10000, 0.10152225768879311)
# (11000, 0.09650397542794534)
# (12000, 0.09215138190015283)
# (13000, 0.08833007115386277)
# (14000, 0.08494068379081839)
# (15000, 0.08190799189738561)
# (16000, 0.07917386534949619)
# (16384, 0.07819331779778638)
# (17000, 0.07669258863232875)
# (18000, 0.07442765375699807)
# (19000, 0.07234950966446299)
# (20000, 0.07043394859402075)
# (32768, 0.05448167268478741)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment