# Created September 6, 2024 04:25
# Code to get player lists from AFL Tables
library(httr2) # request(), req_*(), resp_body_html()
library(rvest) # html_elements(), html_table()
# Extract team ids and display names from team-index links.
#
# link_text: character vector of anchor tags of the form
#   <a href="teams/<team_id>_idx.html"><team_name></a>
# Returns a tibble with one row per element of `link_text` and columns
# `team_id` and `team_name`; elements that do not match the pattern yield NA
# in both columns.
extract_team_ids <- function(link_text) {
  # Lazy groups stop at the first "_idx.html" / "</a>", and the dot before
  # "html" is escaped so it only matches a literal ".".
  matches <- str_match(
    link_text,
    '<a href="teams/(.*?)_idx\\.html">(.*?)</a>'
  )
  # Column 1 is the full match; columns 2 and 3 are the two capture groups.
  team_id <- matches[, 2]
  team_name <- matches[, 3]
  tibble(team_id, team_name)
}
# NOTE(review): the URL literal is empty in this copy of the script; presumably
# it should point at the page that lists every team -- confirm before running.
all_teams_url <- ""

# One row per team (team_id, team_name), excluding the "allteams" entry.
# NOTE(review): `team_urls` is not defined in the visible code; presumably it
# holds the team links scraped from `all_teams_url` -- verify.
teams <- filter(
  map_dfr(team_urls, extract_team_ids),
  team_id != "allteams"
)
# Download and tidy the player list for one team.
#
# team_id: team identifier; the page URL is built as <prefix><team_id>.html.
#
# Visible steps: request the page, parse the body as HTML, read a table from
# it, parse DOB as a Date, split the `Debut` and `Last` columns into year and
# day parts, turn those into dates relative to DOB, sort by `last` (most
# recent first), drop the intermediate Debut*/Last* columns, and prepend
# `team_id` as the first column.
#
# NOTE(review): this copy of the function is incomplete -- several expressions
# are truncated (flagged inline) and no closing brace is visible. Restore the
# missing pieces from the original script before running.
get_team_list <- function(team_id) {
# NOTE(review): the URL prefix is an empty string here, so `url` is just
# "<team_id>.html" -- confirm the intended base URL.
url <- str_c("",
team_id, ".html")
# Fetch the page, sending the session's "HTTPUserAgent" option as the user
# agent, and parse the response body as HTML.
resp <-
request(url) |>
req_user_agent(getOption("HTTPUserAgent")) |>
req_perform() |>
resp_body_html() |>
# NOTE(review): the pipe above has no right-hand side in this copy. A step is
# presumably missing (likely one that selects the table nodes, since
# `resp[[1]]` is passed to html_table() below) -- TODO confirm.
# NOTE(review): `make_range` has no visible body or closing brace, and it is
# not called anywhere in the visible code.
make_range <- function(start, end) {
df <-
# Take the first element of `resp` and convert it to a data frame.
resp[[1]] |>
html_table() |>
mutate(DOB = as.Date(DOB)) |>
# NOTE(review): the filter condition is truncated after `!`.
filter(! |>
# Split values shaped like "<digits>y <digits>d" into <col>_years and
# <col>_days. With too_few = "align_start", values that match only the
# leading part of the pattern get NA for the remaining pieces.
separate_wider_regex(cols = c(Debut, Last),
patterns = c(years = "^[0-9]+", "y ", days = "[0-9]+", "d$"),
names_sep = "_",
too_few = "align_start") |>
# debut / last are computed as DOB plus the years (and days) parsed above;
# presumably Debut/Last hold the player's age at those games -- verify.
# NOTE(review): the if_else() condition is missing in both calls below;
# presumably it tests whether the *_days part is NA -- TODO confirm.
mutate(debut = if_else(,
DOB + years(Debut_years),
DOB + years(Debut_years) + days(Debut_days)),
last = if_else(,
DOB + years(Last_years),
DOB + years(Last_years) + days(Last_days))) |>
arrange(desc(last)) |>
# NOTE(review): the argument name before `= FALSE` is missing; presumably
# `ignore.case`, which would keep the lower-case `debut`/`last` columns
# while dropping the Debut_*/Last_* helpers -- TODO confirm.
select(-matches("^(Debut|Last)", = FALSE)) |>
mutate(team_id = team_id, .before = 1)
# Player lists for every team, combined into a single data frame.
# map() returns one result per team id; list_rbind() stacks those results
# row-wise so that data-frame verbs such as select() can be applied to
# `team_lists` afterwards.
team_lists <-
  map(teams$team_id, get_team_list) |>
  list_rbind()
# One row per player per season.
# `Seasons` is split on commas; each piece is either a single year or a
# "first-last" range, and every piece is expanded into its individual years.
seasons <-
  team_lists |>
  select(team_id, Player, DOB, Seasons) |>
  separate_longer_delim(Seasons, ",") |>
  mutate(Seasons = str_trim(Seasons)) |>
  # Split "first-last" into its two years; too_few = "align_start" leaves
  # last_season as NA when only a single year is present.
  separate_wider_regex(
    Seasons,
    patterns = c(first_season = "^[0-9]+", "-", last_season = "[0-9]+$"),
    too_few = "align_start"
  ) |>
  # A single-year entry starts and ends in the same season.
  mutate(last_season = coalesce(last_season, first_season)) |>
  # Build the year sequence row by row, then drop the rowwise grouping so the
  # remaining steps operate on a plain tibble.
  rowwise() |>
  mutate(seasons = list(first_season:last_season)) |>
  ungroup() |>
  unnest(seasons) |>
  select(-first_season, -last_season)
