Created
July 31, 2019 22:36
-
-
Save benhannel/ee3d4b4c67b293032363a3553f8a7af0 to your computer and use it in GitHub Desktop.
Using SQL to join the nested JSON data with CSV data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%time | |
%%sql | |
WITH | |
-- # compute application statistics, MAU and percent change in MAU. | |
appStats AS | |
( | |
SELECT | |
rows.r[2][1]."name" AS app, | |
rows.r[2][1]."company_name" AS company, | |
rows.r[4][1] AS mau, | |
rows.r[4][4] AS mau_percent_change | |
FROM | |
app_annie_monthly a, | |
unnest(a."data"."table"."rows" AS r) AS rows | |
WHERE | |
a._meta.s3.path LIKE 'app\_annie/monthly/2018-05/01/data/all\_users\_top\_usage\_US\_iphone\_100\_%' | |
), | |
-- # Get list of crunchbase orgs to join with. | |
crunchbaseOrgs AS | |
( | |
SELECT | |
founded_on AS founded_on, | |
uuid AS company_uuid, | |
short_description AS short_description, | |
company_name as company_name | |
FROM | |
"crunchbase_organizations" | |
), | |
-- # Get the JOINED relation from the above steps. | |
appStatsWithCrunchbaseOrgs as | |
( | |
SELECT | |
appStats.app as App, | |
appStats.mau as mau, | |
appStats.mau_percent_change as mau_percent_change, | |
crunchbaseOrgs.company_uuid as company_uuid, | |
crunchbaseOrgs.company_name as company_name, | |
crunchbaseOrgs.founded_on as founded_on, | |
crunchbaseOrgs.short_description as short_description | |
FROM | |
appStats | |
INNER JOIN | |
crunchbaseOrgs | |
ON appStats.company = crunchbaseOrgs.company_name | |
), | |
-- # Compute companyStatus = (IPO|ACQUIRED|CLOSED|OPERATING) | |
-- # There may be more than one status associated with a company, so, we do the Group By and Min. | |
companyStatus as | |
( | |
SELECT | |
company_name, | |
min( | |
case | |
status | |
when | |
'ipo' | |
then | |
1 | |
when | |
'acquired' | |
then | |
2 | |
when | |
'closed' | |
then | |
3 | |
when | |
'operating' | |
then | |
4 | |
end | |
) as status | |
FROM | |
"crunchbase_organizations" | |
GROUP BY | |
company_name | |
), | |
-- # JOIN with companyStatus == (OPERATING), call it ventureFunded | |
ventureFunded as (SELECT | |
appStatsWithCrunchbaseOrgs.App, | |
appStatsWithCrunchbaseOrgs.company_name, | |
appStatsWithCrunchbaseOrgs.mau_percent_change, | |
appStatsWithCrunchbaseOrgs.mau, | |
appStatsWithCrunchbaseOrgs.company_uuid, | |
appStatsWithCrunchbaseOrgs.founded_on, | |
appStatsWithCrunchbaseOrgs.short_description | |
FROM | |
appStatsWithCrunchbaseOrgs | |
INNER JOIN | |
companyStatus | |
ON appStatsWithCrunchbaseOrgs.company_name = companyStatus.company_name | |
AND companyStatus.status = 4), | |
-- # Find the latest round that each company raised, grouped by company UUID | |
latestRound AS | |
( | |
SELECT | |
company_uuid as cuid, | |
max(announced_on) as announced_on, | |
max(raised_amount_usd) as raised_amount_usd | |
FROM | |
"crunchbase_funding_rounds" | |
GROUP BY | |
company_uuid | |
), | |
-- # Join it back with crunchbase_funding_rounds to get other details about that company | |
fundingRounds AS | |
( | |
SELECT | |
cfr.company_uuid as company_uuid, | |
cfr.announced_on as announced_on, | |
cfr.funding_round_uuid as funding_round_uuid, | |
cfr.company_name as company_name, | |
cfr.investment_type as investment_type, | |
cfr.raised_amount_usd as raised_amount_usd, | |
cfr.country_code as country_code, | |
cfr.state_code as state_code, | |
cfr.investor_names as investor_names | |
FROM | |
"crunchbase_funding_rounds" cfr | |
JOIN | |
latestRound | |
ON latestRound.company_uuid = cfr.company_uuid | |
AND latestRound.announced_on = cfr.announced_on | |
), | |
-- # Finally, select the dataset with all the fields that are interesting to us. ventureFundedAllRegions | |
ventureFundedAllRegions AS ( | |
SELECT | |
ventureFunded.App as App, | |
ventureFunded.company_name as company_name, | |
ventureFunded.mau as mau, | |
ventureFunded.mau_percent_change as mau_percent_change, | |
ventureFunded.short_description as short_description, | |
fundingRounds.announced_on as last_funding, | |
fundingRounds.raised_amount_usd as raised_amount_usd, | |
fundingRounds.country_code as country_code, | |
fundingRounds.state_code as state_code, | |
fundingRounds.investor_names as investor_names, | |
fundingRounds.investment_type as investment_type | |
FROM | |
ventureFunded | |
JOIN | |
fundingRounds | |
ON fundingRounds.company_uuid = ventureFunded.company_uuid) | |
SELECT * FROM ventureFundedAllRegions | |
ORDER BY | |
mau_percent_change DESC LIMIT 10 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment