zkamvar · November 8, 2022 15:57
diff --git a/scrape-workshops.R b/scrape-workshops.R
 library("jsonlite")
 library("polite")
 library("rvest")
 library("purrr")
 library("dplyr")


 # Lesson slugs to look for: lesson links scraped from upcoming workshop pages
 # are kept only when their URL slug appears in this vector.
 workbench_slugs <- c(
  "r-ecology-lesson",
  "r-socialsci",
  "r-raster-vector-geospatial",
  "lc-shell",
  "instructor-training",
  "python-ecology-es"
 )

 #' Extract a data frame of lessons from a workshop website
 #'
 #' Collects the links found in the element that immediately follows the
 #' `#schedule` element. A table is tried first, then an `h3` heading, then a
 #' paragraph; the first of these that yields any links is used.
 #'
 #' @param url the URL to a workshop website
 #' @return a data frame with two columns (and zero rows when the page could
 #'   not be scraped or no schedule links were found)
 #'   - url: the url of a lesson or resource
 #'   - name: the name of the lesson or resource
 get_schedule <- function(url) {
   # open the session
   session <- polite::bow(url)
   # get the HTML
   butter <- polite::scrape(session)
   # polite::scrape() yields NULL instead of a document when scraping is not
   # permitted or the request fails; report "no lessons" rather than erroring
   # so one bad workshop site does not abort a run over many sites
   if (is.null(butter)) {
     return(data.frame(url = character(0), name = character(0)))
   }
   # candidate locations of the schedule links, in order of preference
   selectors <- c(
     "#schedule + table a", # links in a table right after #schedule
     "#schedule + h3 a",    # links in a heading right after #schedule
     "#schedule + p a"      # links in a paragraph right after #schedule
   )
   for (selector in selectors) {
     sched <- rvest::html_nodes(butter, selector)
     if (length(sched) > 0) {
       break
     }
   }
   # return a data frame with the URL and name of the lesson
   data.frame(
     url = rvest::html_attr(sched, "href"),
     name = rvest::html_text(sched)
   )
 }

 #' Extract the slug from a URL
 #'
 #' The slug is the second element obtained by splitting the lowercased URL
 #' path on "/". For a path with a leading slash this is the first path
 #' segment, e.g. "r-ecology-lesson" for
 #' "https://example.org/R-ecology-lesson/index.html".
 #'
 #' @param url a character vector of URLs
 #' @return a character vector of URL slugs in lowercase format, with `NA` for
 #'   any URL whose path does not have such an element (e.g. a bare domain)
 get_url_slug <- function(url) {
   # .default keeps URLs without a usable path from raising an error; they
   # come back as NA instead
   xml2::url_parse(url)$path |>
     tolower() |>
     strsplit("/") |>
     purrr::map_chr(2, .default = NA_character_)
 }

 # Download the feed of upcoming workshops and index it by workshop slug.
 upcoming <- read_json("https://feeds.carpentries.org/all_upcoming_workshops.json")
 urls <- setNames(map_chr(upcoming, "url"), map_chr(upcoming, "slug"))
 names(upcoming) <- names(urls)

 # Scrape the schedule links of every workshop site into one table; the
 # workshop each row came from is recorded in the `slug` column.
 lessons <- urls |> map_dfr(get_schedule, .id = "slug")

 # Keep only the links whose URL slug is one of the workbench lessons.
 res <- mutate(lessons, path = get_url_slug(url))
 res <- filter(res, path %in% workbench_slugs)

 # TODO: extract names and find in AMY
	library("jsonlite")
	library("polite")
	library("rvest")
	library("purrr")
	library("dplyr")


	# Lesson slugs to look for: lesson links scraped from upcoming workshop pages
	# are kept only when their URL slug appears in this vector.
	workbench_slugs <- c(
	"r-ecology-lesson",
	"r-socialsci",
	"r-raster-vector-geospatial",
	"lc-shell",
	"instructor-training",
	"python-ecology-es"
	)

	#' Extract a data frame of lessons from a workshop website
	#'
	#' Collects the links found in the element that immediately follows the
	#' `#schedule` element. A table is tried first, then an `h3` heading, then a
	#' paragraph; the first of these that yields any links is used.
	#'
	#' @param url the URL to a workshop website
	#' @return a data frame with two columns (and zero rows when the page could
	#'   not be scraped or no schedule links were found)
	#'   - url: the url of a lesson or resource
	#'   - name: the name of the lesson or resource
	get_schedule <- function(url) {
	  # open the session
	  session <- polite::bow(url)
	  # get the HTML
	  butter <- polite::scrape(session)
	  # polite::scrape() yields NULL instead of a document when scraping is not
	  # permitted or the request fails; report "no lessons" rather than erroring
	  # so one bad workshop site does not abort a run over many sites
	  if (is.null(butter)) {
	    return(data.frame(url = character(0), name = character(0)))
	  }
	  # candidate locations of the schedule links, in order of preference
	  selectors <- c(
	    "#schedule + table a", # links in a table right after #schedule
	    "#schedule + h3 a",    # links in a heading right after #schedule
	    "#schedule + p a"      # links in a paragraph right after #schedule
	  )
	  for (selector in selectors) {
	    sched <- rvest::html_nodes(butter, selector)
	    if (length(sched) > 0) {
	      break
	    }
	  }
	  # return a data frame with the URL and name of the lesson
	  data.frame(
	    url = rvest::html_attr(sched, "href"),
	    name = rvest::html_text(sched)
	  )
	}

	#' Extract the slug from a URL
	#'
	#' The slug is the second element obtained by splitting the lowercased URL
	#' path on "/". For a path with a leading slash this is the first path
	#' segment, e.g. "r-ecology-lesson" for
	#' "https://example.org/R-ecology-lesson/index.html".
	#'
	#' @param url a character vector of URLs
	#' @return a character vector of URL slugs in lowercase format, with `NA` for
	#'   any URL whose path does not have such an element (e.g. a bare domain)
	get_url_slug <- function(url) {
	  # .default keeps URLs without a usable path from raising an error; they
	  # come back as NA instead
	  xml2::url_parse(url)$path |>
	    tolower() |>
	    strsplit("/") |>
	    purrr::map_chr(2, .default = NA_character_)
	}

	# get the upcoming workshops, keyed by workshop slug
	upcoming <- read_json("https://feeds.carpentries.org/all_upcoming_workshops.json")
	urls <- map_chr(upcoming, "url")
	names(urls) <- map_chr(upcoming, "slug")
	names(upcoming) <- names(urls)

	# create the link table; the workshop each row came from is stored in `slug`
	lessons <- map_dfr(urls, get_schedule, .id = "slug")

	# find out if we have any lessons that are upcoming that use the workbench
	res <- lessons |>
	  mutate(path = get_url_slug(url)) |>
	  filter(path %in% workbench_slugs)

	# TODO: extract names and find in AMY