Created
November 8, 2022 15:57
-
-
Save zkamvar/bbca87e6053fbb43449cf7f7cb8f3e5f to your computer and use it in GitHub Desktop.
Scrape lessons from workshops and determine if they have workbench lessons
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("jsonlite") | |
library("polite") | |
library("rvest") | |
library("purrr") | |
library("dplyr") | |
# Lowercase repository slugs of the lessons that use the workbench. Slugs
# extracted from scraped lesson URLs are matched against this vector.
workbench_slugs <- c(
  "r-ecology-lesson", "r-socialsci", "r-raster-vector-geospatial",
  "lc-shell", "instructor-training", "python-ecology-es"
)
#' Extract a data frame of lessons from a workshop website
#'
#' Collects the links found in the element that immediately follows the
#' element with id `schedule`. A table is tried first, then an `h3` heading,
#' then a paragraph; the first selector that matches any link wins.
#'
#' @param url the URL to a workshop website
#' @return a data frame with two character columns (zero rows when the page
#'   could not be fetched/parsed or no schedule links were found)
#'   - url: the url of a lesson or resource
#'   - name: the name of the lesson or resource
get_schedule <- function(url) {
  no_lessons <- data.frame(url = character(0), name = character(0))
  # open the session and get the HTML. This function is mapped over many
  # workshop sites, so one unreachable site is downgraded to a warning
  # instead of aborting the whole run.
  butter <- tryCatch(
    polite::scrape(polite::bow(url)),
    error = function(e) {
      warning(
        "Could not scrape ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  # Anything that is not a parsed document cannot be queried with CSS
  # selectors.
  # NOTE(review): polite::scrape() appears to return NULL for failed or
  # disallowed requests -- confirm against the polite documentation.
  if (!inherits(butter, "xml_node")) {
    return(no_lessons)
  }
  # `+` is the adjacent-sibling combinator: only the element directly after
  # #schedule is searched for links.
  selectors <- c(
    "#schedule + table a", # links in a schedule table
    "#schedule + h3 a",    # links in a heading
    "#schedule + p a"      # links in a paragraph
  )
  for (selector in selectors) {
    sched <- rvest::html_nodes(butter, selector)
    if (length(sched) > 0) {
      break
    }
  }
  # return a data frame with the URL and name of the lesson
  data.frame(
    url = rvest::html_attr(sched, "href"),
    name = rvest::html_text(sched)
  )
}
#' Extract the slug from a URL
#'
#' The slug is the second "/"-separated piece of the lowercased URL path.
#' For a path such as "/r-ecology-lesson/01-intro.html" the first piece is
#' the empty string before the leading slash and the second piece is
#' "r-ecology-lesson".
#'
#' @param url a character vector of URLs
#' @return a character vector the same length as `url` holding the URL slug
#'   in lowercase format, or `NA` where the path has fewer than two pieces
#'   (e.g. a bare domain or an anchor-only link)
get_url_slug <- function(url) {
  xml2::url_parse(url)$path |>
    tolower() |>
    strsplit("/", fixed = TRUE) |>
    # .default keeps one slug-less link from failing the whole vector
    purrr::map_chr(2, .default = NA_character_)
}
# Fetch the feed of upcoming workshops; both the URL vector and the feed
# entries themselves are keyed by workshop slug.
upcoming <- jsonlite::read_json(
  "https://feeds.carpentries.org/all_upcoming_workshops.json"
)
urls <- setNames(
  purrr::map_chr(upcoming, "url"),
  purrr::map_chr(upcoming, "slug")
)
names(upcoming) <- names(urls)

# Scrape every workshop site into one link table; the `slug` column records
# which workshop each link came from.
lessons <- purrr::map_dfr(urls, get_schedule, .id = "slug")

# Keep only the links whose URL slug belongs to a lesson that uses the
# workbench.
res <- dplyr::filter(
  dplyr::mutate(lessons, path = get_url_slug(url)),
  path %in% workbench_slugs
)
# TODO: extract names and find in AMY
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment