Last active
August 18, 2016 12:12
-
-
Save jvilledieu/4968309b478125bfbfe02576812fb304 to your computer and use it in GitHub Desktop.
New crunchbase import script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
https://linkurio.us/the-crunchbase-graph-data-modelling/ | |
https://linkurio.us/crunchbase-graph-importing-data-neo4j/ | |
https://linkurio.us/crunchbase-graph-analysing-graph/ | |
Check also Neo4j csv and etl guide | |
*/ | |
// Verifying the CSV file | |
// assert correct line count | |
LOAD CSV FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line | |
RETURN count(*); | |
// check first few raw lines | |
LOAD CSV FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line | |
RETURN line | |
LIMIT 1; | |
// check first 5 line-sample with header-mapping | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line | |
RETURN line | |
LIMIT 1; | |
//-=-=-=-=-=-=-= | |
//Cleaning up the database | |
MATCH (n) | |
WITH n LIMIT 10000 | |
OPTIONAL MATCH (n)-[r]->() | |
DELETE n,r; | |
//Importing the file Companies.csv | |
//Creating the constraints | |
create constraint on (a:COMPANY) assert a.permalink is unique; | |
create constraint on (b:CATEGORY) assert b.name is unique; | |
create constraint on (c:MARKET) assert c.name is unique; | |
create constraint on (d:STATUS) assert d.name is unique; | |
create constraint on (e:COUNTRY) assert e.name is unique; | |
create constraint on (f:STATE) assert f.name is unique; | |
create constraint on (g:REGION) assert g.name is unique; | |
create constraint on (h:CITY) assert h.name is unique; | |
//Creating the nodes | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line | |
WHERE line.category_list IS NOT NULL | |
MERGE (b:CATEGORY {name: line.category_list}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line | |
WHERE line.market IS NOT NULL | |
MERGE (c:MARKET {name: line.market}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line | |
WHERE line.status IS NOT NULL | |
MERGE (d:STATUS {name: line.status}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line | |
WHERE line.country_code IS NOT NULL | |
MERGE (e:COUNTRY {name: line.country_code}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line | |
WHERE line.state_code IS NOT NULL | |
MERGE (f:STATE {name: line.state_code}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line | |
WHERE line.region IS NOT NULL | |
MERGE (g:REGION {name: line.region}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line WITH line | |
WHERE line.city IS NOT NULL | |
MERGE (h:CITY {name: line.city}); | |
// Check label counts: | |
MATCH n | |
RETURN DISTINCT count(labels(n)), labels(n); | |
USING PERIODIC COMMIT 2000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Companies.csv" AS line | |
MERGE (a:COMPANY { permalink: line.permalink }) | |
ON CREATE SET a.name = line.name, | |
a.homepage_url = line.homepage_url, | |
a.funding_total = toInt(line.funding_total_usd), | |
a.funding_rounds = toInt(line.funding_rounds), | |
a.founded_at = line.founded_at, | |
a.founded_month = line.founded_month, | |
a.founded_quarter = line.founded_quarter, | |
a.founded_year = toInt(line.founded_year), | |
a.first_funding_at = line.first_funding_at, | |
a.last_funding_at = line.last_funding_at, | |
a.category = line.category_list, | |
a.market = line.market, | |
a.status = line.status, | |
a.country = line.country_code, | |
a.state = line.state_code, | |
a.region = line.region, | |
a.city = line.city, | |
a.url = 'http://www.crunchbase.com' + line.permalink, | |
a.logo = 'http://www.crunchbase.com' + line.permalink + '/primary-image/raw'; | |
//Creating the edges | |
//Creating an index on the companies to speed up the import | |
DROP CONSTRAINT ON (a:COMPANY) ASSERT a.permalink IS UNIQUE; | |
CREATE INDEX ON :COMPANY(permalink); | |
//Companies & Categories | |
MATCH (a:COMPANY) | |
MATCH (b:CATEGORY {name: a.category}) | |
CREATE (a)-[:HAS_CATEGORY]->(b); | |
//Companies & Market | |
MATCH (a:COMPANY) | |
MATCH (b:MARKET {name: a.market}) | |
CREATE (a)-[:HAS_MARKET]->(b); | |
//Companies & Status | |
MATCH (a:COMPANY) | |
MATCH (b:STATUS {name: a.status}) | |
CREATE (a)-[:HAS_STATUS]->(b); | |
//Companies & Country | |
MATCH (a:COMPANY) | |
MATCH (b:COUNTRY {name: a.country}) | |
CREATE (a)-[:HAS_COUNTRY]->(b); | |
//Companies & State | |
MATCH (a:COMPANY) | |
MATCH (b:STATE {name: a.state}) | |
CREATE (a)-[:HAS_STATE]->(b); | |
//Companies & Region | |
MATCH (a:COMPANY) | |
MATCH (b:REGION {name: a.region}) | |
CREATE (a)-[:HAS_REGION]->(b); | |
//Companies & Cities | |
MATCH (a:COMPANY) | |
MATCH (b:CITY {name: a.city}) | |
CREATE (a)-[:HAS_CITY]->(b); | |
//Importing the file Investments.csv | |
//Creating the constraints | |
create constraint on (a:INVESTOR) assert a.permalink is unique; | |
create constraint on (b:INVESTOR_CATEGORY) assert b.name is unique; | |
create constraint on (c:INVESTOR_MARKET) assert c.name is unique; | |
create constraint on (e:COUNTRY) assert e.name is unique; | |
create constraint on (f:STATE) assert f.name is unique; | |
create constraint on (g:REGION) assert g.name is unique; | |
create constraint on (h:CITY) assert h.name is unique; | |
create constraint on (i:FUNDING_ROUND) assert i.permalink is unique; | |
create constraint on (j:ROUND_CODE) assert j.name is unique; | |
create constraint on (k:ROUND_TYPE) assert k.name is unique; | |
//Creating the nodes | |
//Relabel added companies that invested to become INVESTOR | |
USING PERIODIC COMMIT 2000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line | |
MATCH (a:COMPANY { permalink: line.investor_permalink }) | |
SET a:INVESTOR; | |
USING PERIODIC COMMIT 2000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.investor_permalink IS NOT NULL | |
MERGE (a:INVESTOR { permalink: line.investor_permalink }) | |
ON CREATE SET a.name = line.investor_name, | |
a.category = line.investor_category_list, | |
a.market = line.investor_market, | |
a.country = line.investor_country_code, | |
a.state = line.investor_state_code, | |
a.region = line.investor_region, | |
a.city = line.investor_city, | |
a.logo = 'http://www.crunchbase.com' + line.permalink + '/primary-image/raw', | |
a.url = 'http://www.crunchbase.com' + line.investor_permalink; | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.investor_category_list IS NOT NULL | |
MERGE (b:INVESTOR_CATEGORY {name: line.investor_category_list}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.investor_market IS NOT NULL | |
MERGE (c:INVESTOR_MARKET {name: line.investor_market}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.investor_country_code IS NOT NULL | |
MERGE (d:COUNTRY {name: line.investor_country_code}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.investor_state_code IS NOT NULL | |
MERGE (e:STATE {name: line.investor_state_code}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.investor_region IS NOT NULL | |
MERGE (f:REGION {name: line.investor_region}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.investor_city IS NOT NULL | |
MERGE (g:CITY {name: line.investor_city}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.funding_round_permalink IS NOT NULL | |
MERGE (h:FUNDING_ROUND {permalink: line.funding_round_permalink}) | |
ON CREATE SET h.funded_at = line.funded_at, | |
h.funded_month = line.funded_month, | |
h.funded_quarter = line.funded_quarter, | |
h.funded_year = toInt(line.funded_year), | |
h.raised_amount_usd = toInt(line.raised_amount_usd), | |
h. funding_round_code = line.funding_round_code, | |
h.funding_round_type = line.funding_round_type, | |
h.url = 'http://www.crunchbase.com' + line.permalink; | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.funding_round_code IS NOT NULL | |
MERGE (i:ROUND_CODE {name: line.funding_round_code}); | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.funding_round_type IS NOT NULL | |
MERGE (j:ROUND_TYPE {name: line.funding_round_type}); | |
//Creating the edges | |
//Creating an index on the investors and funding rounds to speed up the import | |
DROP CONSTRAINT ON (a:INVESTOR) ASSERT a.permalink IS UNIQUE; | |
CREATE INDEX ON :INVESTOR(permalink); | |
DROP CONSTRAINT ON (a:FUNDING_ROUND) ASSERT a.permalink IS UNIQUE; | |
CREATE INDEX ON :FUNDING_ROUND(permalink); | |
//Investor & Category | |
MATCH (a:INVESTOR) | |
MATCH (b:INVESTOR_CATEGORY { name: a.category}) | |
CREATE (a)-[:HAS_INVESTOR_CATEGORY]->(b); | |
//Investor & Market | |
MATCH (a:INVESTOR) | |
MATCH (b:INVESTOR_MARKET { name: a.market}) | |
CREATE (a)-[:HAS_INVESTOR_MARKET]->(b); | |
//Investor and Country | |
MATCH (a:INVESTOR) | |
MATCH (b:COUNTRY { name: a.country}) | |
CREATE (a)-[:HAS_COUNTRY]->(b); | |
//Investor & State | |
MATCH (a:INVESTOR) | |
MATCH (b:STATE { name: a.state}) | |
CREATE (a)-[:HAS_STATE]->(b); | |
//Investor & Region | |
MATCH (a:INVESTOR) | |
MATCH (b:REGION { name: a.region}) | |
CREATE (a)-[:HAS_REGION]->(b); | |
//Investor & City | |
MATCH (a:INVESTOR) | |
MATCH (b:CITY { name: a.city}) | |
CREATE (a)-[:HAS_CITY]->(b); | |
//Investor & Funding Round | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.investor_permalink IS NOT NULL AND line.funding_round_permalink IS NOT NULL | |
MATCH (a:INVESTOR { permalink: line.investor_permalink}),(b:FUNDING_ROUND { permalink: line.funding_round_permalink}) | |
CREATE (a)-[:HAS_FUNDED]->(b); | |
//Startup & Funding Round | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Investments.csv" AS line WITH line | |
WHERE line.company_permalink IS NOT NULL AND line.funding_round_permalink IS NOT NULL | |
MATCH (a:COMPANY { permalink: line.company_permalink}), (b:FUNDING_ROUND { permalink: line.funding_round_permalink}) | |
CREATE (a)-[:HAS_BEEN_FUNDED]->(b); | |
//Funding round and round code | |
MATCH (a:FUNDING_ROUND) | |
MATCH (b:ROUND_CODE { name: a.funding_round_code}) | |
CREATE (a)-[:HAS_ROUND_CODE]->(b); | |
//Funding round and round type | |
MATCH (a:FUNDING_ROUND) | |
MATCH (b:ROUND_TYPE { name: a.funding_round_type}) | |
CREATE (a)-[:HAS_ROUND_TYPE]->(b); | |
//Importing the file Acquisitions.csv | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:/Developer/Neo4jCaseStudies/Crunchbase/data/Acquisitions.csv" AS line WITH line | |
WHERE line.company_permalink IS NOT NULL AND line.acquirer_permalink IS NOT NULL | |
MATCH (a:COMPANY {permalink: line.company_permalink}), (b:COMPANY {permalink: line.acquirer_permalink}) | |
CREATE (b)-[:HAS_ACQUIRED {acquisition_date : line.acquired_at, month: line.acquired_month, quarter: line.acquired_quarter, year: line.acquired_year, price_amount: toInt(line.price_amount), currency: line.price_currency_code}]->(a); | |
//----------------------- | |
//Cleaning up | |
//----------------------- | |
MATCH (n:COMPANY) | |
WHERE n.state = '' | |
REMOVE n.state; | |
MATCH (n:COMPANY) | |
WHERE n.homepage_url = '' | |
REMOVE n.homepage_url; | |
MATCH (n:COMPANY) | |
WHERE n.funding_total = '' | |
REMOVE n.funding_total; | |
MATCH (n:COMPANY) | |
WHERE n.funding_rounds = '' | |
REMOVE n.funding_rounds; | |
MATCH (n:COMPANY) | |
WHERE n.founded_at = '' | |
REMOVE n.founded_at; | |
MATCH (n:COMPANY) | |
WHERE n.founded_month = '' | |
REMOVE n.founded_month; | |
MATCH (n:COMPANY) | |
WHERE n.founded_quarter = '' | |
REMOVE n.founded_quarter; | |
MATCH (n:COMPANY) | |
WHERE n.founded_year = '' | |
REMOVE n.founded_year; | |
MATCH (n:COMPANY) | |
WHERE n.first_funding_at = '' | |
REMOVE n.first_funding_at; | |
MATCH (n:COMPANY) | |
WHERE n.last_funding_at = '' | |
REMOVE n.last_funding_at; | |
MATCH (n:COMPANY) | |
WHERE n.category = '' | |
REMOVE n.category; | |
MATCH (n:COMPANY) | |
WHERE n.market = '' | |
REMOVE n.market; | |
MATCH (n:COMPANY) | |
WHERE n.status = '' | |
REMOVE n.status; | |
MATCH (n:COMPANY) | |
WHERE n.country = '' | |
REMOVE n.country; | |
MATCH (n:COMPANY) | |
WHERE n.state = '' | |
REMOVE n.state; | |
MATCH (n:COMPANY) | |
WHERE n.region = '' | |
REMOVE n.region; | |
MATCH (n:COMPANY) | |
REMOVE n.city; | |
MATCH (n:INVESTOR) | |
WHERE n.state = '' | |
REMOVE n.state; | |
MATCH (n:INVESTOR) | |
WHERE n.category = '' | |
REMOVE n.category; | |
MATCH (n:INVESTOR) | |
WHERE n.market = '' | |
REMOVE n.market; | |
MATCH (n:INVESTOR) | |
WHERE n.city = '' | |
REMOVE n.city; | |
MATCH (n:INVESTOR) | |
WHERE n.region = '' | |
REMOVE n.region; | |
MATCH (n:INVESTOR) | |
WHERE n.country = '' | |
REMOVE n.country; | |
optional match (n)-[r]-() where n.name = '' | |
DELETE n, r; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment