Skip to content

Instantly share code, notes, and snippets.

@NickCH-K
Last active December 22, 2020 07:23
Show Gist options
  • Save NickCH-K/2c620f6db353d7cd7d63142c5ed7ee56 to your computer and use it in GitHub Desktop.
Save NickCH-K/2c620f6db353d7cd7d63142c5ed7ee56 to your computer and use it in GitHub Desktop.
########## THIS FILE REQUIRES data.table TO BE LOADED TO FUNCTION. USE library(data.table)
#' Patterns File Lookup
#'
#' Given a date or range of dates, returns a character vector of the folder
#' paths that contain the weekly patterns files covering those dates. Folders
#' (rather than files) are returned because the subfolder below each of them
#' is based on the hour the data is released, which can't be predicted ahead
#' of time for future weeks. After downloading the files, either run each
#' folder through
#' \code{list.files(pattern = '.csv.gz', recursive = TRUE, full.names = TRUE)}
#' yourself, or set \code{list_files = TRUE} to get the full set of filepaths
#' back directly.
#'
#' Dates up to and including December 6, 2020 are resolved against the "old"
#' (\code{patterns_backfill}) folder structure, and dates from December 7,
#' 2020 onward against the "new" (\code{patterns}) folder structure. In both
#' cases the folder is derived from the Monday that starts the week containing
#' the date; for the "new" structure, 9 days are added to that Monday to get
#' the release date used in the folder name.
#'
#' The function uses base R only, so no packages need to be attached for it to
#' run.
#'
#' @param dates A vector of \code{Date} objects (perhaps taking a single
#'   \code{Date} object and adding \code{0:finish}) to find the associated
#'   files for. Must not contain missing values.
#' @param dir If specified, will prepend \code{dir} to the filepaths, to get
#'   full filepaths. If using both "old" (pre-Dec 7, 2020) and "new" (post)
#'   dates, this will only work if both the "patterns_backfill" (old) and
#'   "patterns" (new) folders are in the same folder. Superseded by
#'   \code{old_dir} and \code{new_dir} for old and new files, respectively.
#' @param old_dir If specified, will prepend \code{old_dir} to the filepaths
#'   for all "old" (pre-Dec 7, 2020) files. This should be the folder that
#'   contains the \code{patterns_backfill} folder.
#' @param new_dir If specified, will prepend \code{new_dir} to the filepaths
#'   for all "new" (post-Dec 7, 2020) files. This should be the folder that
#'   contains the \code{patterns} folder.
#' @param silent If \code{TRUE}, will omit the warning for using any dates
#'   after the author last checked the consistency of the SafeGraph file
#'   structure.
#' @param add_ma Also looks at the \code{add_ma} days before the dates listed
#'   in \code{dates}, so you can calculate an \code{add_ma}-day moving average.
#'   Or you could just change the \code{dates} argument yourself to allow this.
#' @param patterns_backfill_date Character variable with the folder structure
#'   for the most recent \code{patterns_backfill} pull. i.e., the 2018, 2019,
#'   and 2020 folders containing backfill data in their subfolders should sit
#'   in the
#'   \code{paste0(old_dir,'/patterns_backfill/',patterns_backfill_date)}
#'   folder.
#' @param list_files After creating folderpaths, run each of them through
#'   \code{list.files(pattern = '\\\\.csv', recursive = TRUE, full.names = TRUE)}
#'   to get a usable list of files. This only works if all the files have
#'   already been downloaded; folders with no matching files produce a warning
#'   and contribute nothing to the result.
#' @return A character vector. With \code{list_files = FALSE}, the unique
#'   folder paths (each ending in \code{/}), "old" folders first and then
#'   "new" folders. With \code{list_files = TRUE}, the filepaths found inside
#'   those folders. A zero-length character vector if \code{dates} is empty.
patterns_lookup <- function(dates,
                            dir = NULL,
                            old_dir = NULL,
                            new_dir = NULL,
                            silent = FALSE,
                            add_ma = 0,
                            patterns_backfill_date = "2020/12/14/21/",
                            list_files = FALSE) {
  if (!inherits(dates, "Date")) {
    stop("dates must be a vector of Date objects.")
  }
  if (anyNA(dates)) {
    # Missing dates would otherwise turn into 'NA/NA/NA/' folder names
    stop("dates must not contain missing values.")
  }
  if (add_ma < 0) {
    stop("add_ma must be nonnegative.")
  }
  # Nothing to look up; also keeps max() below away from an empty vector
  if (length(dates) == 0) {
    return(character(0))
  }

  # Make a non-empty path prefix end in "/" so it can be pasted onto a folder
  with_trailing_slash <- function(path) {
    if (nchar(path) > 0 && !endsWith(path, "/")) {
      path <- paste0(path, "/")
    }
    path
  }

  # Monday on or before each date. as.POSIXlt()$wday is 0 for Sunday through
  # 6 for Saturday regardless of any week-start option, so (wday + 6) %% 7 is
  # the number of days elapsed since Monday.
  week_monday <- function(d) {
    d - (as.POSIXlt(d)$wday + 6) %% 7
  }

  # Expand folder paths into the csv files found (recursively) beneath them
  list_csv_files <- function(folders) {
    found <- lapply(folders, function(folder) {
      # Drop the trailing "/" so returned paths don't contain "//"
      fls <- list.files(sub("/+$", "", folder), pattern = "\\.csv",
                        recursive = TRUE, full.names = TRUE)
      if (length(fls) == 0) {
        warning(paste0("Found no files in ", folder,
                       ". list_files requires files be downloaded first."),
                call. = FALSE)
      }
      fls
    })
    # unlist() of an empty list is NULL; always hand back a character vector
    as.character(unlist(found, use.names = FALSE))
  }

  # Fill in null values of dir; old_dir/new_dir take precedence over dir
  if (is.null(dir)) {
    dir <- ""
  }
  if (is.null(old_dir)) {
    old_dir <- dir
  }
  if (is.null(new_dir)) {
    new_dir <- dir
  }
  old_dir <- with_trailing_slash(old_dir)
  new_dir <- with_trailing_slash(new_dir)
  patterns_backfill_date <- with_trailing_slash(patterns_backfill_date)

  # Warn about dates past the last point the file structure was checked
  if (!silent && max(dates) > as.Date("2020-12-16")) {
    warning("This function has been tested to match the SafeGraph file structure as of Dec 16, 2020. Any file structure changes since then could make your result wrong.")
  }

  # Add the add_ma days preceding every requested date
  if (add_ma > 0) {
    n_back <- as.integer(add_ma)
    lagged <- rep(dates, times = n_back) -
      rep(seq_len(n_back), each = length(dates))
    dates <- sort(unique(c(dates, lagged)))
  }

  # Split the dates into old (patterns_backfill) and new (patterns)
  old <- dates[dates <= as.Date("2020-12-06")]
  new <- dates[dates >= as.Date("2020-12-07")]

  # The length guards matter: paste0() treats a zero-length argument as ""
  # and would otherwise produce a bogus one-element path.
  folders <- character(0)
  if (length(old) > 0) {
    # Old folders are named for the Monday that starts the week
    folders <- c(folders, unique(paste0(
      old_dir,
      "patterns_backfill/",
      patterns_backfill_date,
      format(week_monday(old), "%Y/%m/%d/")
    )))
  }
  if (length(new) > 0) {
    # New folders are named for the release date: week-start Monday + 9 days
    folders <- c(folders, unique(paste0(
      new_dir,
      "patterns/",
      format(week_monday(new) + 9, "%Y/%m/%d/")
    )))
  }

  if (list_files) {
    return(list_csv_files(folders))
  }
  folders
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment