Skip to content

Instantly share code, notes, and snippets.

@yjunechoe
Last active August 17, 2024 17:33
Show Gist options
  • Save yjunechoe/468aa63dace4cab6c3aab9f611b71ca3 to your computer and use it in GitHub Desktop.
Save yjunechoe/468aa63dace4cab6c3aab9f611b71ca3 to your computer and use it in GitHub Desktop.
artifact parsed mapping comparison functions
## Setup ----
devtools::load_all(".")
library(dplyr)
library(tidyr)
library(gt)
mangle <- function(x) {
stringi::stri_sub(x, 1, 2) <- "__"
x
}
gen_tib <- function(i) {
tib <- bind_rows(
tibble(
col_raw = "MONTHS",
col_parsed = "months",
value_raw = toupper(month.name[i]),
value_parsed = month.name[i],
n = i
),
tibble(
col_raw = "STATES",
col_parsed = "states",
value_raw = toupper(state.name[i]),
value_parsed = state.name[i],
n = i
)
)
tib %>%
rowwise() %>%
mutate(value_raw = mangle(value_raw)) %>%
ungroup()
}
old_artifact1 <- gen_tib(1:3)
old_artifact2 <- gen_tib(3:4)
new_problems <- gen_tib(3:5) %>%
select(-value_parsed) %>%
uncount(n) %>%
mutate(row = sample(row_number() * 2))
new_artifacts <- gen_tib(3:5) %>%
uncount(n) %>%
mutate(row = sample(row_number() * 2)) %>%
count(pick(everything(), -row)) %>%
mutate(value_parsed = case_match(value_parsed, "Arkansas" ~ "Arkansaw", .default = value_parsed))
new_artifacts
## Setup simulate read/write in a project ----
init_temp_proj(dates = as.Date(0) + c(0, 365, 365*2))
write_parsing_artifacts(new_artifacts, script = "read_data")
write_parsing_artifacts(old_artifact2, script = "read_data",
path = path_to_prior(path_artifact("clin_data"), 1))
write_parsing_artifacts(old_artifact1, script = "read_data",
path = path_to_prior(path_artifact("clin_data"), 2))
old_artifacts <- fs::dir_ls(
dir_all_data()[-1],
recurse = TRUE, regexp = "clin_data/parsing.csv$"
) %>%
readr::read_csv()
## Table rendering ----
# Color palette for mapping types
map_pal <- function(x) {
dplyr::case_match(x, "changed" ~ "#FFA900", "new" ~ "skyblue", NA ~ "white")
}
# Render an html table of mappings
render_mappings_table <- function(mappings,
title = NULL,
type = c("all", "new", "changed"),
style = htmltools::css(),
...) {
# Return empty table for NULL / 0-row data
if (NROW(mappings) == 0) {
empty_table <- data.frame() %>%
gt::gt() %>%
gt::tab_header(title)
return(empty_table)
}
cols <- mappings %>%
dplyr::bind_rows() %>%
dplyr::distinct(.data$col_raw, .data$col_parsed) %>%
as.list()
stopifnot(
"`mappings` must be for a single column" = all(lengths(cols) == 1)
)
if (is.null(title)) {
title <- htmltools::HTML(paste(cols$col_raw, "&rarr;", cols$col_parsed))
}
# Make sure `.mapping` column is present
if (type != "all") {
mappings$.mapping <- type
}
# If comparing problems (vs. mappings), move new ones to the top
if (type == "all" && "row" %in% colnames(mappings)) {
mappings <- mappings %>%
dplyr::arrange(.data$.mapping, .data$row)
}
mappings_table <- mappings %>%
dplyr::mutate(.color = "", .before = 1L) %>%
gt::gt() %>%
gt::opt_interactive(...) %>%
gt::cols_hide(dplyr::any_of(c("col_raw", "col_parsed", ".mapping"))) %>%
gt::tab_header(gt::md(title)) %>%
gt::cols_label(.color = "") %>%
gt::data_color(".mapping", target_columns = ".color", fn = map_pal) %>%
gt::cols_width(.color ~ px(20))
htmltools::div(style = style, mappings_table) %>%
htmltools::browsable()
}
## Compare problems ----
new_problems
# Compre raw non-parsing values encountered (which ones are new?)
compare_parsing_problems <- function(new_problems, old_artifacts) {
new_problems <- dplyr::bind_rows(new_problems)
keys <- c("col_raw", "col_parsed", "value_raw")
new_values <- dplyr::anti_join(new_problems, old_artifacts, by = keys) %>%
dplyr::select(dplyr::all_of(keys)) %>%
dplyr::distinct() %>%
dplyr::mutate(.mapping = "new")
compared_problems <- left_join(new_problems, new_values, by = keys) %>%
dplyr::arrange(.data$col_raw, .data$row)
compared_problems
}
compared_problems <- compare_parsing_problems(new_problems, old_artifacts)
compared_problems %>%
split(~ col_raw) %>%
lapply(render_mappings_table, type = "all")
## Compare mappings ----
new_artifacts
# Extract updates to mappings between new and old parsing artifacts
compare_parsed_mappings <- function(new_artifacts, old_artifacts) {
keys <- c("col_raw", "col_parsed", "value_raw", "value_parsed")
# 1) New mappings (= previously unseen parsing problems)
maps_new <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys[-4]) %>%
mutate(.mapping = "new")
# All mapppings found in `new_artifacts` but not in `old_artifacts`
maps_new_chg <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys)
# 2a) Changed mappings (= different repairs on a previously seen parsing problems)
maps_chg_bare <- dplyr::anti_join(maps_new_chg, maps_new, by = keys) %>%
mutate(.mapping = "changed")
# 2b) Changed mappings with records of old mappings (+ their counts)
maps_chg <- dplyr::left_join(
maps_chg_bare %>%
dplyr::select(-"n"),
old_artifacts %>%
dplyr::rename("prior_parsed" = "value_parsed"),
by = keys[-4]
) %>%
dplyr::relocate(".mapping", .after = dplyr::last_col())
# 3) Combined record of all mappings: seen, new, and changed
maps_all <- dplyr::left_join(
new_artifacts, dplyr::bind_rows(maps_new, maps_chg_bare),
by = c(keys, "n")
)
# Return mapping analysis results
mappings <- list(all = maps_all, new = maps_new, changed = maps_chg)
mappings <- purrr::map(mappings, function(x) {
x %>% dplyr::arrange(.data$col_raw, dplyr::desc(.data$n))
})
mappings
}
compared_mappings <- compare_parsed_mappings(new_artifacts, old_artifacts)
compared_mappings
# Summary of 3 tables arranged for a parsed column
render_parsing_summaries <- function(mappings) {
cols <- mappings %>%
dplyr::bind_rows() %>%
distinct(.data$col_raw, .data$col_parsed) %>%
as.list()
stopifnot(
"`mappings` must be for a single column" = all(lengths(cols) == 1)
)
tables_html <- htmltools::div(
style = htmltools::css(
display = "grid", max.width = "80%", margin = "0% 5%",
grid.template.columns = "1fr 1fr", grid.template.rows = "auto auto",
grid.gap = "10px 30px"
),
render_mappings_table(
mappings$new, type = "new",
title = "**New** problems and their repairs"
),
render_mappings_table(
mappings$changed, type = "changed",
title = "**Changed** repairs for previously-seen problems"
),
render_mappings_table(
mappings$all, type = "all",
title = "**All** problems and their repairs",
style = htmltools::css(grid.column = "1 / span 2")
)
)
title_html <- htmltools::tags$h2(
style = htmltools::css(
transform = "translateX(5%)", margin.bottom = "10px", font.size = "300%"
),
htmltools::HTML(paste(cols$col_raw, "&rarr;", cols$col_parsed))
)
parsing_summary <- htmltools::div(
id = paste0("parse-", cols$col_raw),
title_html, tables_html
)
htmltools::browsable(parsing_summary)
}
compared_mappings %>%
lapply(split, ~ col_raw) %>%
purrr::transpose() %>%
lapply(render_parsing_summaries)
@yjunechoe
Copy link
Author

yjunechoe commented Aug 16, 2024

library(dplyr)
library(gt)

mangle <- function(x) {
  stringi::stri_sub(x, 1, 2) <- "__"
  x
}
gen_tib <- function(i) {
  tib <- bind_rows(
    tibble(
      col_raw = "MONTHS",
      col_parsed = "months",
      value_raw = toupper(month.name[i]),
      value_parsed = month.name[i],
      n = i
    ),
    tibble(
      col_raw = "STATES",
      col_parsed = "states",
      value_raw = toupper(state.name[i]),
      value_parsed = state.name[i],
      n = i
    )
  )
  tib %>% 
    rowwise() %>% 
    mutate(value_raw = mangle(value_raw)) %>% 
    ungroup()
}

old_artifact1 <- gen_tib(1:3)
old_artifact2 <- gen_tib(3:4)
old_artifacts <- bind_rows(old_artifact1, old_artifact2) %>% 
  count(pick(everything()), wt = n, name = "n")

new_problems <- gen_tib(3:5) %>% 
  select(-value_parsed) %>%
  uncount(n) %>% 
  mutate(row = sample(row_number() * 2))
new_artifacts <- gen_tib(3:5) %>% 
  uncount(n) %>% 
  mutate(row = sample(row_number() * 2)) %>% 
  count(pick(everything(), -row)) %>% 
  mutate(value_parsed = replace(value_parsed, 1, "Apr"))

old_artifacts <- old_artifacts %>% filter(col_raw == "X")
new_artifacts <- new_artifacts %>% filter(col_raw == "X")

old_artifacts
new_artifacts

# Extract updates to mappings between new and old parsing artifacts
compare_parsed_mappings <- function(new_artifacts, old_artifacts) {
  
  keys <- c("col_raw", "col_parsed", "value_raw", "value_parsed")
  
  # 1) New mappings (= previously unseen parsing problems)
  maps_new <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys[-4])
  maps_new$.mapping <- "new"
  
  # All mapppings found in `new_artifacts` but not in `old_artifacts`
  maps_new_chg <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys)
  
  # 2a) Changed mappings (= different repairs on a previously seen parsing problems)
  maps_chg_bare <- dplyr::anti_join(maps_new_chg, maps_new, by = keys)
  maps_chg_bare$.mapping <- "changed"
  # 2b) Changed mappings with records of old mappings (+ their counts)
  maps_chg <- dplyr::left_join(
    maps_chg_bare %>% 
      dplyr::select(-"n"),
    old_artifacts %>% 
      dplyr::rename("prior_parsed" = "value_parsed"),
    by = keys[-4]
  ) %>% 
    dplyr::relocate(".mapping", .after = dplyr::last_col())
  
  # 3) Combined record of all mappings: seen, new, and changed
  maps_all <- dplyr::left_join(
    new_artifacts, dplyr::bind_rows(maps_new, maps_chg_bare),
    by = c(keys, "n")
  )
  
  # Return mapping analysis results
  mappings <- list(all = maps_all, new = maps_new, changed = maps_chg)
  mappings <- purrr::map(mappings, function(x) {
    x %>% dplyr::arrange(.data$col_raw, .data$col_parsed, dplyr::desc(.data$n))
  })
  mappings
  
}

compared_mappings <- compare_parsed_mappings(new_artifacts, old_artifacts)

# Color palette for mapping types
map_pal <- function(x) {
  dplyr::case_match(
    as.character(x),
    "changed" ~ "#FFA900",
    "new" ~ "skyblue",
    NA ~ "white"
  )
}

# Render an html table of mappings
render_mappings_table <- function(mappings, type = c("all", "new", "changed"),
                                  ...,
                                  title = NULL, style = htmltools::css()) {

  # Return empty table for NULL / 0-row data
  if (NROW(mappings) == 0) {
    empty_table <- data.frame() %>% 
      gt::gt() %>% 
      gt::tab_header(title)
    return(empty_table)
  }
  
  cols <- mappings %>% 
    dplyr::bind_rows() %>% 
    dplyr::distinct(.data$col_raw, .data$col_parsed) %>% 
    as.list()
  
  stopifnot(
    "`mappings` must be for a single column" = all(lengths(cols) == 1)
  )
  
  if (is.null(title)) {
    title <- htmltools::HTML(paste(cols$col_raw, "&rarr;", cols$col_parsed))
  }
  
  # Make sure `.mapping` column is present
  if (type != "all") {
    mappings$.mapping <- type
  }
  
  mappings_table <- mappings %>% 
    dplyr::arrange(.data$.mapping) %>% 
    dplyr::mutate(.color = "", .before = 1L) %>% 
    gt::gt() %>% 
    gt::opt_interactive(...) %>% 
    gt::cols_hide(dplyr::any_of(c("col_raw", "col_parsed", ".mapping"))) %>% 
    gt::tab_header(gt::md(title)) %>% 
    gt::cols_label(.color = "") %>% 
    gt::data_color(".mapping", target_columns = ".color", fn = map_pal) %>% 
    gt::cols_width(.color ~ px(20)) 
  
  htmltools::div(style = style, mappings_table) %>%
    htmltools::browsable()
  
}

render_parsing_summaries <- function(mappings) {
  
  cols <- mappings %>% 
    dplyr::bind_rows() %>% 
    distinct(.data$col_raw, .data$col_parsed) %>% 
    as.list()
  
  stopifnot(
    "`mappings` must be for a single column" = all(lengths(cols) == 1)
  )
  
  tables_html <- htmltools::div(
    style = htmltools::css(
      display = "grid", max.width = "80%", margin = "0% 5%",
      grid.template.columns = "1fr 1fr", grid.template.rows = "auto auto",
      grid.gap = "10px 30px"
    ),
    render_mappings_table(
      mappings$new, type = "new",
      title = "**New** problems and their repairs"
    ),
    render_mappings_table(
      mappings$changed, type = "changed",
      title = "**Changed** repairs for previously-seen problems"
    ),
    render_mappings_table(
      mappings$all, type = "all",
      title = "**All** problems and their repairs",
      style = htmltools::css(grid.column = "1 / span 2")
    )
  )
  
  title_html <- htmltools::tags$h2(
    style = htmltools::css(
      transform = "translateX(5%)", margin.bottom = "10px", font.size = "300%"
    ),
    htmltools::HTML(paste(cols$col_raw, "&rarr;", cols$col_parsed))
  )
  
  parsing_summary <- htmltools::div(
    id = paste0("parse-", cols$col_raw),
    title_html, tables_html
  )
  
  htmltools::browsable(parsing_summary)
  
}

render_parsing_summaries(mappings)
compare_parsing_problems <- function(new_problems, old_artifacts) {
  
  new_problems <- dplyr::bind_rows(new_problems)  
  keys <- c("col_raw", "col_parsed", "value_raw")
  
  new_values <- dplyr::anti_join(new_problems, old_artifacts, by = keys) %>% 
    dplyr::select(dplyr::all_of(keys)) %>% 
    dplyr::distinct() %>% 
    dplyr::mutate(.mapping = "new")
  
  left_join(new_problems, new_values, by = keys)
  
}

@yjunechoe
Copy link
Author

## Setup ----
devtools::load_all(".")
library(dplyr)
library(tidyr)
library(gt)

mangle <- function(x) {
  stringi::stri_sub(x, 1, 2) <- "__"
  x
}
gen_tib <- function(i) {
  tib <- bind_rows(
    tibble(
      col_raw = "MONTHS",
      col_parsed = "months",
      value_raw = toupper(month.name[i]),
      value_parsed = month.name[i],
      n = i
    ),
    tibble(
      col_raw = "STATES",
      col_parsed = "states",
      value_raw = toupper(state.name[i]),
      value_parsed = state.name[i],
      n = i
    )
  )
  tib %>% 
    rowwise() %>% 
    mutate(value_raw = mangle(value_raw)) %>% 
    ungroup()
}

old_artifact1 <- gen_tib(1:3)
old_artifact2 <- gen_tib(3:4)

new_problems <- gen_tib(3:5) %>% 
  select(-value_parsed) %>%
  uncount(n) %>% 
  mutate(row = sample(row_number() * 2))
new_artifacts <- gen_tib(3:5) %>% 
  uncount(n) %>% 
  mutate(row = sample(row_number() * 2)) %>% 
  count(pick(everything(), -row)) %>% 
  mutate(value_parsed = case_match(value_parsed, "Arkansas" ~ "Arkansaw", .default = value_parsed))

new_artifacts

## Setup simulate read/write in a project ----

init_temp_proj(dates = as.Date(0) + c(0, 365, 365*2))
write_parsing_artifacts(new_artifacts, script = "read_data")
write_parsing_artifacts(old_artifact2, script = "read_data",
                        path = path_to_prior(path_artifact("clin_data"), 1))
write_parsing_artifacts(old_artifact1, script = "read_data",
                        path = path_to_prior(path_artifact("clin_data"), 2))

old_artifacts <- fs::dir_ls(
  dir_all_data()[-1],
  recurse = TRUE, regexp = "clin_data/parsing.csv$"
) %>% 
  readr::read_csv()

## Table rendering ----

# Color palette for mapping types
map_pal <- function(x) {
  dplyr::case_match(x, "changed" ~ "#FFA900", "new" ~ "skyblue", NA ~ "white")
}

# Render an html table of mappings
render_mappings_table <- function(mappings,
                                  title = NULL,
                                  type = c("all", "new", "changed"),
                                  style = htmltools::css(),
                                  ...) {
  
  # Return empty table for NULL / 0-row data
  if (NROW(mappings) == 0) {
    empty_table <- data.frame() %>% 
      gt::gt() %>% 
      gt::tab_header(title)
    return(empty_table)
  }
  
  cols <- mappings %>% 
    dplyr::bind_rows() %>% 
    dplyr::distinct(.data$col_raw, .data$col_parsed) %>% 
    as.list()
  
  stopifnot(
    "`mappings` must be for a single column" = all(lengths(cols) == 1)
  )
  
  if (is.null(title)) {
    title <- htmltools::HTML(paste(cols$col_raw, "&rarr;", cols$col_parsed))
  }
  
  # Make sure `.mapping` column is present
  if (type != "all") {
    mappings$.mapping <- type
  }
  
  # If comparing problems (vs. mappings), move new ones to the top
  if (type == "all" && "row" %in% colnames(mappings)) {
    mappings <- mappings %>% 
      dplyr::arrange(.data$.mapping, .data$row)
  }
  
  mappings_table <- mappings %>% 
    dplyr::mutate(.color = "", .before = 1L) %>% 
    gt::gt() %>% 
    gt::opt_interactive(...) %>% 
    gt::cols_hide(dplyr::any_of(c("col_raw", "col_parsed", ".mapping"))) %>% 
    gt::tab_header(gt::md(title)) %>% 
    gt::cols_label(.color = "") %>% 
    gt::data_color(".mapping", target_columns = ".color", fn = map_pal) %>% 
    gt::cols_width(.color ~ px(20)) 
  
  htmltools::div(style = style, mappings_table) %>%
    htmltools::browsable()
  
}

## Compare problems ----
new_problems

# Compre raw non-parsing values encountered (which ones are new?)
compare_parsing_problems <- function(new_problems, old_artifacts) {
  
  new_problems <- dplyr::bind_rows(new_problems)  
  keys <- c("col_raw", "col_parsed", "value_raw")
  
  new_values <- dplyr::anti_join(new_problems, old_artifacts, by = keys) %>% 
    dplyr::select(dplyr::all_of(keys)) %>% 
    dplyr::distinct() %>% 
    dplyr::mutate(.mapping = "new")
  
  compared_problems <- left_join(new_problems, new_values, by = keys) %>% 
    dplyr::arrange(.data$col_raw, .data$row)
  compared_problems
  
}

compared_problems <- compare_parsing_problems(new_problems, old_artifacts)

compared_problems %>% 
  split(~ col_raw) %>% 
  lapply(render_mappings_table, type = "all")

## Compare mappings ----
new_artifacts

# Extract updates to mappings between new and old parsing artifacts
compare_parsed_mappings <- function(new_artifacts, old_artifacts) {
  
  keys <- c("col_raw", "col_parsed", "value_raw", "value_parsed")
  
  # 1) New mappings (= previously unseen parsing problems)
  maps_new <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys[-4])
  maps_new$.mapping <- "new"
  
  # All mapppings found in `new_artifacts` but not in `old_artifacts`
  maps_new_chg <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys)
  
  # 2a) Changed mappings (= different repairs on a previously seen parsing problems)
  maps_chg_bare <- dplyr::anti_join(maps_new_chg, maps_new, by = keys)
  maps_chg_bare$.mapping <- "changed"
  # 2b) Changed mappings with records of old mappings (+ their counts)
  maps_chg <- dplyr::left_join(
    maps_chg_bare %>% 
      dplyr::select(-"n"),
    old_artifacts %>% 
      dplyr::rename("prior_parsed" = "value_parsed"),
    by = keys[-4]
  ) %>% 
    dplyr::relocate(".mapping", .after = dplyr::last_col())
  
  # 3) Combined record of all mappings: seen, new, and changed
  maps_all <- dplyr::left_join(
    new_artifacts, dplyr::bind_rows(maps_new, maps_chg_bare),
    by = c(keys, "n")
  )
  
  # Return mapping analysis results
  mappings <- list(all = maps_all, new = maps_new, changed = maps_chg)
  mappings <- purrr::map(mappings, function(x) {
    x %>% dplyr::arrange(.data$col_raw, dplyr::desc(.data$n))
  })
  mappings
  
}

compared_mappings <- compare_parsed_mappings(new_artifacts, old_artifacts)
compared_mappings

# Summary of 3 tables arranged for a parsed column
render_parsing_summaries <- function(mappings) {
  
  cols <- mappings %>% 
    dplyr::bind_rows() %>% 
    distinct(.data$col_raw, .data$col_parsed) %>% 
    as.list()
  
  stopifnot(
    "`mappings` must be for a single column" = all(lengths(cols) == 1)
  )
  
  tables_html <- htmltools::div(
    style = htmltools::css(
      display = "grid", max.width = "80%", margin = "0% 5%",
      grid.template.columns = "1fr 1fr", grid.template.rows = "auto auto",
      grid.gap = "10px 30px"
    ),
    render_mappings_table(
      mappings$new, type = "new",
      title = "**New** problems and their repairs"
    ),
    render_mappings_table(
      mappings$changed, type = "changed",
      title = "**Changed** repairs for previously-seen problems"
    ),
    render_mappings_table(
      mappings$all, type = "all",
      title = "**All** problems and their repairs",
      style = htmltools::css(grid.column = "1 / span 2")
    )
  )
  
  title_html <- htmltools::tags$h2(
    style = htmltools::css(
      transform = "translateX(5%)", margin.bottom = "10px", font.size = "300%"
    ),
    htmltools::HTML(paste(cols$col_raw, "&rarr;", cols$col_parsed))
  )
  
  parsing_summary <- htmltools::div(
    id = paste0("parse-", cols$col_raw),
    title_html, tables_html
  )
  
  htmltools::browsable(parsing_summary)
  
}

compared_mappings %>% 
  lapply(split, ~ col_raw) %>% 
  purrr::transpose() %>% 
  lapply(render_parsing_summaries)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment