Last active
July 5, 2019 02:04
-
-
Save daigotanaka/3907b11126114a5c74b84f89d0041e19 to your computer and use it in GitHub Desktop.
SQL histogram template
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Histogram template | |
Edit Config and Input sections at the top of this query. | |
Config: | |
- bin_size: Width of the bin | |
- min_bin | |
- max_bin | |
Input: | |
- c: The column of interest | |
- t: Table | |
Output: | |
- bin: The text label of the bin. For the out of range (min_bin, max_bin), '<=' and '>' is added | |
- n: Count | |
- bin_num: Numeric bin labels convenient for sorting bin correctly in business intelligence tools | |
*/ | |
with | |
/* Config */ | |
config as ( | |
select | |
10 as bin_size, | |
10 as min_bin, | |
100 as max_bin | |
), | |
/* Inputs */ | |
t as ( | |
select | |
-- Set the column of interest in the next line | |
<your_column> | |
as c | |
-- Set the table of interest in the next line | |
from <your_table> | |
-- Any filters? | |
where 1=1 | |
and date_diff(current_date, registered_on, day) > (select max_bin from config) | |
), | |
/* Don't touch below */ | |
-- Create a number sequence from 0~999 | |
seed as ( | |
select v from ( | |
select 0 as v union all select 1 union all select 2 union all select 3 union all select 4 union all select 5 | |
union all select 6 union all select 7 union all select 8 union all select 9) | |
), | |
n999 as ( | |
select (s0.v + 10 * s1.v + 100 * s2.v) as v | |
from seed as s0 cross join seed as s1 cross join seed as s2 | |
), | |
-- Create bins in the range min_bin ~ max_bin | |
bins as ( | |
select bin, | |
0 as n | |
from (select v * (select bin_size from config) as bin from n999) | |
where (select min_bin from config) < bin and bin <= (select max_bin from config) | |
order by 1 | |
), | |
-- Count instances by bin | |
-- Also generate the sum for the out of range values | |
counts as ( | |
-- Equal or less than min_bin | |
select | |
(select min_bin from config) as bin, | |
sum(case when c < (select min_bin from config) then 1 else 0 end) as n | |
from t | |
-- Greater than max_bin | |
union all( | |
select | |
(select max_bin from config) + 1 as bin, | |
sum(case when c >= (select max_bin from config) then 1 else 0 end) as n | |
from t) | |
-- Everything in between | |
union all ( | |
select | |
(floor(c / (select bin_size from config)) + 1) * (select bin_size from config) as bin, | |
count(*) as n | |
from t | |
where (select min_bin from config) <= c and c < (select max_bin from config) | |
group by 1) | |
order by 1 | |
), | |
with_zero_bins as ( | |
select | |
coalesce(b.bin, c.bin) as bin, | |
coalesce(b.n, 0) + coalesce(c.n, 0) as n | |
from bins as b | |
full join counts as c | |
on b.bin=c.bin | |
), | |
histogram as ( | |
select | |
bin as bin_num, | |
case | |
when bin = (select min_bin from config) then concat('<', cast((select min_bin from config) as string)) | |
when bin > (select max_bin from config) then concat ('>=', cast((select max_bin from config) as string)) | |
else cast(bin as string) | |
end as bin, | |
n | |
from with_zero_bins | |
) | |
-- Check sum | |
-- select count(*) as n from t union all select sum(n) from histogram | |
select | |
bin, | |
n, | |
-- Use bin_num to sort bins | |
bin_num | |
from histogram | |
order by bin_num |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment