Queries based on the Citi bikes dataset using DuckDB.
Associated YouTube video: https://www.youtube.com/watch?v=KTIBhsIoCvk
CREATE OR REPLACE TABLE bikeStations AS
FROM read_csv_auto('data/*.csv', types={"station_id": "VARCHAR"})
WHERE missing_station_information = false;
ALTER TABLE bikeStations ADD COLUMN ts TIMESTAMP;
UPDATE bikeStations
SET ts = to_timestamp(station_status_last_reported)::TIMESTAMP AT TIME ZONE 'US/Eastern';
.mode line
SELECT * FROM bikeStations LIMIT 1;
.mode duckbox
SELECT station_name,
date_trunc('DAY', ts) AS day,
dayName(ts) AS weekDay,
round(avg(num_bikes_available), 2) AS avg,
round(min(num_bikes_available), 2) AS min,
round(max(num_bikes_available), 2) AS max,
round(stddev(num_bikes_available), 2) AS stddev
FROM bikeStations
WHERE station_id = 128
GROUP BY ALL
ORDER BY day
LIMIT 10;
// First compute the average availability for the day
SELECT station_name, date_trunc('DAY', ts) AS day,
round(avg(num_bikes_available) OVER (
PARTITION BY station_id, day
), 2) AS avg,
ts,
num_bikes_available AS available
FROM bikeStations
ORDER BY day, ts
LIMIT 10;
WITH avgByDay AS (
SELECT station_id, station_name,
date_trunc('DAY', ts) AS day,
round(avg(num_bikes_available) OVER (
PARTITION BY station_id, day
), 2) AS avg,
ts,
num_bikes_available AS available,
FROM bikeStations
ORDER BY ts, day
)
SELECT ts, available, avg
FROM avgByDay
WHERE station_id = 128 AND day = '2019-08-06'
AND available > avg
LIMIT 10;
WITH avgByDay AS (
SELECT station_id, station_name, date_trunc('DAY', ts) AS day,
ts,
num_bikes_available AS available,
round(avg(num_bikes_available) OVER (
PARTITION BY station_id, day
), 2) AS avg
FROM bikeStations
ORDER BY ts, day
)
SELECT ts, available, avg
FROM avgByDay
WHERE station_id = 128 AND day = '2019-08-06'
AND available < avg
ORDER BY available
LIMIT 10;
WITH avgByDay AS (
SELECT station_id, station_name, date_trunc('DAY', ts) AS day, ts,
num_bikes_available as available,
round(avg(num_bikes_available) OVER station, 2) AS avg,
round(min(num_bikes_available) OVER station, 2) AS min,
round(max(num_bikes_available) OVER station, 2) AS max,
quantile_cont(num_bikes_available, 0.75) OVER station AS q75
FROM bikeStations
WINDOW station AS (
PARTITION BY station_id, day
)
ORDER BY ts, day
)
SELECT ts, available, avg, min, max, q75
FROM avgByDay
WHERE station_id = 128 AND day = '2019-08-07'
AND available > q75
LIMIT 10;
WITH avgByDay AS (
SELECT station_id, station_name, date_trunc('DAY', ts) AS day, ts,
num_bikes_available as available,
round(avg(num_bikes_available) OVER station, 2) AS avg,
round(min(num_bikes_available) OVER station, 2) AS min,
round(max(num_bikes_available) OVER station, 2) AS max,
quantile_cont(num_bikes_available, 0.75) OVER station AS q75
FROM bikeStations
WINDOW station AS (
PARTITION BY station_id, day
)
ORDER BY ts, day
)
SELECT station_name, strftime(ts, '%H:%M:%S') AS time,
available, avg, min, max, q75
FROM avgByDay
WHERE ts BETWEEN '2019-08-09 17:00:00' AND '2019-08-09 18:00:00'
AND available > q75
LIMIT 10;
WITH movingAverage AS (
SELECT station_id, ts, num_bikes_available AS available,
round(avg(num_bikes_available) OVER rolling, 2) AS avg,
lag(num_bikes_available, 1) OVER rolling AS "n-1",
lag(num_bikes_available, 2) OVER rolling AS "n-2"
FROM bikeStations
WINDOW rolling AS (
PARTITION BY station_id
ORDER BY ts
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
)
ORDER BY ts
)
SELECT ts, avg, "n-2", "n-1", available AS current
FROM movingAverage
WHERE station_id = 128 AND date_trunc('DAY', ts) = '2019-08-06'
ORDER BY ts
LIMIT 10;
WITH movingAverage AS (
SELECT station_id, ts, num_bikes_available AS available,
round(avg(num_bikes_available) OVER rolling, 2) AS avg,
round(min(num_bikes_available) OVER rolling, 2) AS min,
round(max(num_bikes_available) OVER rolling, 2) AS max
FROM bikeStations
WINDOW rolling AS (
PARTITION BY station_id
ORDER BY ts
RANGE BETWEEN INTERVAL 10 MINUTES PRECEDING AND CURRENT ROW
)
ORDER BY ts
)
SELECT ts, available, avg, min, max
FROM movingAverage
WHERE station_id = 128 AND date_trunc('DAY', ts) = '2019-08-06'
ORDER BY ts
LIMIT 10;