pwolfram · August 29, 2015 14:24
diff --git a/fast_hist.py b/fast_hist.py
 #!/usr/bin/env python

 import numpy as np

 def fast_hist(data, bin_edges):
    """Fast 1-dimensional histogram. Comparable to numpy.histogram(), but careless.
    
    'bin_edges' should encompass all values in 'data'; the first and last elements 
    in 'bin_edges' are ignored, and are effectively (-infinity, infinity).

    Returns the histogram array only.
    """
    # Yes, I've tested this against histogram().
    return np.bincount(np.digitize(data, bin_edges[1:-1]), minlength=len(bin_edges) - 1)

 def fast_hist_2d(data, bin_edges):
    """Fast 2-dimensional histogram. Comparable to numpy.histogramdd(), but careless.
    
    'data' is an Nx2 array. 'bin_edges' is used for both dimensions and should
    encompass all values in 'data'; the first and last elements in 'bin_edges'
    are ignored, and are effectively (-infinity, infinity).

    Returns the histogram array only.
    """
    # Yes, I've tested this against histogramdd().
    xassign = np.digitize(data[:,0], bin_edges[1:-1]) 
    yassign = np.digitize(data[:,1], bin_edges[1:-1])
    nbins = len(bin_edges) - 1
    flatcount = np.bincount(xassign + yassign * nbins, minlength=nbins*nbins)
    return flatcount.reshape((nbins, nbins)).T

 def fast_hist_weighted(data, weights, bin_edges):
    """Fast 1-dimensional histogram. Comparable to numpy.histogram(), but careless.

    'bin_edges' should encompass all values in 'data'; the first and last elements
    in 'bin_edges' are ignored, and are effectively (-infinity, infinity).

    Returns the histogram array only with weighting.  Non-weighted version at https://gist.github.com/nkeim/4455635

    """
    return np.bincount(np.digitize(data, bin_edges[1:-1]), minlength=len(bin_edges) - 1, weights=weights)

 def test_hist():
    print 'Note: bins must contain the whole range of the data for fast and regular histograms to be equivalent!'
    a = np.random.randn(1e7)
    b = np.random.randn(1e7)
    lvec = np.array([-1000, 0,1,4,16, 1000])
    pdf,_ = np.histogram(a,bins=lvec,weights=b,density=False)
    pdfnew = fast_hist_weighted(a,b,lvec)
    print 'Error between weighted fast histograms:'
    print np.max(np.abs(pdf-pdfnew))
    print np.mean(np.sqrt((pdf-pdfnew)**2.0))


    pdf,_ = np.histogram(a,bins=lvec,density=False)
    pdfnew = fast_hist(a,lvec)
    print 'Error between fast histograms:'
    print np.max(np.abs(pdf-pdfnew))
    print np.mean(np.sqrt((pdf-pdfnew)**2.0))

 if __name__ == "__main__":
    test_hist()
	#!/usr/bin/env python

	import numpy as np

	def fast_hist(data, bin_edges):
	"""Fast 1-dimensional histogram. Comparable to numpy.histogram(), but careless.

	'bin_edges' should encompass all values in 'data'; the first and last elements
	in 'bin_edges' are ignored, and are effectively (-infinity, infinity).

	Returns the histogram array only.
	"""
	# Yes, I've tested this against histogram().
	return np.bincount(np.digitize(data, bin_edges[1:-1]), minlength=len(bin_edges) - 1)

	def fast_hist_2d(data, bin_edges):
	"""Fast 2-dimensional histogram. Comparable to numpy.histogramdd(), but careless.

	'data' is an Nx2 array. 'bin_edges' is used for both dimensions and should
	encompass all values in 'data'; the first and last elements in 'bin_edges'
	are ignored, and are effectively (-infinity, infinity).

	Returns the histogram array only.
	"""
	# Yes, I've tested this against histogramdd().
	xassign = np.digitize(data[:,0], bin_edges[1:-1])
	yassign = np.digitize(data[:,1], bin_edges[1:-1])
	nbins = len(bin_edges) - 1
	flatcount = np.bincount(xassign + yassign * nbins, minlength=nbins*nbins)
	return flatcount.reshape((nbins, nbins)).T

	def fast_hist_weighted(data, weights, bin_edges):
	"""Fast 1-dimensional histogram. Comparable to numpy.histogram(), but careless.

	'bin_edges' should encompass all values in 'data'; the first and last elements
	in 'bin_edges' are ignored, and are effectively (-infinity, infinity).

	Returns the histogram array only with weighting. Non-weighted version at https://gist.github.com/nkeim/4455635

	"""
	return np.bincount(np.digitize(data, bin_edges[1:-1]), minlength=len(bin_edges) - 1, weights=weights)

	def test_hist():
	print 'Note: bins must contain the whole range of the data for fast and regular histograms to be equivalent!'
	a = np.random.randn(1e7)
	b = np.random.randn(1e7)
	lvec = np.array([-1000, 0,1,4,16, 1000])
	pdf,_ = np.histogram(a,bins=lvec,weights=b,density=False)
	pdfnew = fast_hist_weighted(a,b,lvec)
	print 'Error between weighted fast histograms:'
	print np.max(np.abs(pdf-pdfnew))
	print np.mean(np.sqrt((pdf-pdfnew)**2.0))


	pdf,_ = np.histogram(a,bins=lvec,density=False)
	pdfnew = fast_hist(a,lvec)
	print 'Error between fast histograms:'
	print np.max(np.abs(pdf-pdfnew))
	print np.mean(np.sqrt((pdf-pdfnew)**2.0))

	if __name__ == "__main__":
	test_hist()