Created
July 18, 2022 12:02
-
-
Save edonnachie/6e12ef92b9a672e68a036c489e93d0ae to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(arrow) | |
library(duckdb) | |
# install.packages(c("dplyr", "arrpw", "duckdb", "nycflights13")) | |
## Export the nycflights13 dataset to arrow format ---- | |
nycflights13::flights |> | |
# Partition by month to speed up and make keep files small | |
group_by(month) |> | |
arrow::write_dataset(path = "nyflights/flights") | |
nycflights13::airlines |> | |
arrow::write_dataset(path = "nyflights/airlines") | |
nycflights13::airports |> | |
arrow::write_dataset(path = "nyflights/airports") | |
nycflights13::weather |> | |
arrow::write_dataset(path = "nyflights/weather") | |
## Open the dataset using arrow ---- | |
flights <- open_dataset("nyflights/flights", partitioning = "month") | |
airlines <- open_dataset("nyflights/airlines") | |
weather <- open_dataset("nyflights/weather") | |
airports <- open_dataset("nyflights/airports") | |
## DuckDB setup ---- | |
# Initialise a duckDB database connection | |
duck <- dbConnect(duckdb::duckdb()) | |
# Register the arrow datasets as duckDB views | |
duckdb::duckdb_register_arrow(duck, "flights", flights) | |
duckdb::duckdb_register_arrow(duck, "airlines", airlines) | |
duckdb::duckdb_register_arrow(duck, "airports", airports) | |
duckdb::duckdb_register_arrow(duck, "weather", weather) | |
## Do some queries ---- | |
# Using the arrow datasets, without duckDB | |
flights |> | |
filter(month == 5 & day == 15) |> | |
left_join(airports, by = c("origin" = "faa")) |> | |
collect() | |
# dbplyr | |
tbl(duck, "flights") |> | |
filter(month == 2) |> | |
left_join(tbl(duck, "airports"), by = c("origin" = "faa")) | |
# duckDB | |
dbGetQuery(duck, " | |
select dest, count(*) n_flights from flights | |
inner join weather | |
using (origin, year, month, day, hour) | |
where wind_speed > 15 | |
group by dest | |
order by n_flights desc | |
") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment