Last active
August 17, 2024 17:33
-
-
Save yjunechoe/468aa63dace4cab6c3aab9f611b71ca3 to your computer and use it in GitHub Desktop.
artifact parsed mapping comparison functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Setup ---- | |
devtools::load_all(".") | |
library(dplyr) | |
library(tidyr) | |
library(gt) | |
mangle <- function(x) { | |
stringi::stri_sub(x, 1, 2) <- "__" | |
x | |
} | |
gen_tib <- function(i) { | |
tib <- bind_rows( | |
tibble( | |
col_raw = "MONTHS", | |
col_parsed = "months", | |
value_raw = toupper(month.name[i]), | |
value_parsed = month.name[i], | |
n = i | |
), | |
tibble( | |
col_raw = "STATES", | |
col_parsed = "states", | |
value_raw = toupper(state.name[i]), | |
value_parsed = state.name[i], | |
n = i | |
) | |
) | |
tib %>% | |
rowwise() %>% | |
mutate(value_raw = mangle(value_raw)) %>% | |
ungroup() | |
} | |
old_artifact1 <- gen_tib(1:3) | |
old_artifact2 <- gen_tib(3:4) | |
new_problems <- gen_tib(3:5) %>% | |
select(-value_parsed) %>% | |
uncount(n) %>% | |
mutate(row = sample(row_number() * 2)) | |
new_artifacts <- gen_tib(3:5) %>% | |
uncount(n) %>% | |
mutate(row = sample(row_number() * 2)) %>% | |
count(pick(everything(), -row)) %>% | |
mutate(value_parsed = case_match(value_parsed, "Arkansas" ~ "Arkansaw", .default = value_parsed)) | |
new_artifacts | |
## Setup simulate read/write in a project ---- | |
init_temp_proj(dates = as.Date(0) + c(0, 365, 365*2)) | |
write_parsing_artifacts(new_artifacts, script = "read_data") | |
write_parsing_artifacts(old_artifact2, script = "read_data", | |
path = path_to_prior(path_artifact("clin_data"), 1)) | |
write_parsing_artifacts(old_artifact1, script = "read_data", | |
path = path_to_prior(path_artifact("clin_data"), 2)) | |
old_artifacts <- fs::dir_ls( | |
dir_all_data()[-1], | |
recurse = TRUE, regexp = "clin_data/parsing.csv$" | |
) %>% | |
readr::read_csv() | |
## Table rendering ---- | |
# Color palette for mapping types | |
map_pal <- function(x) { | |
dplyr::case_match(x, "changed" ~ "#FFA900", "new" ~ "skyblue", NA ~ "white") | |
} | |
# Render an html table of mappings | |
render_mappings_table <- function(mappings, | |
title = NULL, | |
type = c("all", "new", "changed"), | |
style = htmltools::css(), | |
...) { | |
# Return empty table for NULL / 0-row data | |
if (NROW(mappings) == 0) { | |
empty_table <- data.frame() %>% | |
gt::gt() %>% | |
gt::tab_header(title) | |
return(empty_table) | |
} | |
cols <- mappings %>% | |
dplyr::bind_rows() %>% | |
dplyr::distinct(.data$col_raw, .data$col_parsed) %>% | |
as.list() | |
stopifnot( | |
"`mappings` must be for a single column" = all(lengths(cols) == 1) | |
) | |
if (is.null(title)) { | |
title <- htmltools::HTML(paste(cols$col_raw, "→", cols$col_parsed)) | |
} | |
# Make sure `.mapping` column is present | |
if (type != "all") { | |
mappings$.mapping <- type | |
} | |
# If comparing problems (vs. mappings), move new ones to the top | |
if (type == "all" && "row" %in% colnames(mappings)) { | |
mappings <- mappings %>% | |
dplyr::arrange(.data$.mapping, .data$row) | |
} | |
mappings_table <- mappings %>% | |
dplyr::mutate(.color = "", .before = 1L) %>% | |
gt::gt() %>% | |
gt::opt_interactive(...) %>% | |
gt::cols_hide(dplyr::any_of(c("col_raw", "col_parsed", ".mapping"))) %>% | |
gt::tab_header(gt::md(title)) %>% | |
gt::cols_label(.color = "") %>% | |
gt::data_color(".mapping", target_columns = ".color", fn = map_pal) %>% | |
gt::cols_width(.color ~ px(20)) | |
htmltools::div(style = style, mappings_table) %>% | |
htmltools::browsable() | |
} | |
## Compare problems ---- | |
new_problems | |
# Compre raw non-parsing values encountered (which ones are new?) | |
compare_parsing_problems <- function(new_problems, old_artifacts) { | |
new_problems <- dplyr::bind_rows(new_problems) | |
keys <- c("col_raw", "col_parsed", "value_raw") | |
new_values <- dplyr::anti_join(new_problems, old_artifacts, by = keys) %>% | |
dplyr::select(dplyr::all_of(keys)) %>% | |
dplyr::distinct() %>% | |
dplyr::mutate(.mapping = "new") | |
compared_problems <- left_join(new_problems, new_values, by = keys) %>% | |
dplyr::arrange(.data$col_raw, .data$row) | |
compared_problems | |
} | |
compared_problems <- compare_parsing_problems(new_problems, old_artifacts) | |
compared_problems %>% | |
split(~ col_raw) %>% | |
lapply(render_mappings_table, type = "all") | |
## Compare mappings ---- | |
new_artifacts | |
# Extract updates to mappings between new and old parsing artifacts | |
compare_parsed_mappings <- function(new_artifacts, old_artifacts) { | |
keys <- c("col_raw", "col_parsed", "value_raw", "value_parsed") | |
# 1) New mappings (= previously unseen parsing problems) | |
maps_new <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys[-4]) %>% | |
mutate(.mapping = "new") | |
# All mapppings found in `new_artifacts` but not in `old_artifacts` | |
maps_new_chg <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys) | |
# 2a) Changed mappings (= different repairs on a previously seen parsing problems) | |
maps_chg_bare <- dplyr::anti_join(maps_new_chg, maps_new, by = keys) %>% | |
mutate(.mapping = "changed") | |
# 2b) Changed mappings with records of old mappings (+ their counts) | |
maps_chg <- dplyr::left_join( | |
maps_chg_bare %>% | |
dplyr::select(-"n"), | |
old_artifacts %>% | |
dplyr::rename("prior_parsed" = "value_parsed"), | |
by = keys[-4] | |
) %>% | |
dplyr::relocate(".mapping", .after = dplyr::last_col()) | |
# 3) Combined record of all mappings: seen, new, and changed | |
maps_all <- dplyr::left_join( | |
new_artifacts, dplyr::bind_rows(maps_new, maps_chg_bare), | |
by = c(keys, "n") | |
) | |
# Return mapping analysis results | |
mappings <- list(all = maps_all, new = maps_new, changed = maps_chg) | |
mappings <- purrr::map(mappings, function(x) { | |
x %>% dplyr::arrange(.data$col_raw, dplyr::desc(.data$n)) | |
}) | |
mappings | |
} | |
compared_mappings <- compare_parsed_mappings(new_artifacts, old_artifacts) | |
compared_mappings | |
# Summary of 3 tables arranged for a parsed column | |
render_parsing_summaries <- function(mappings) { | |
cols <- mappings %>% | |
dplyr::bind_rows() %>% | |
distinct(.data$col_raw, .data$col_parsed) %>% | |
as.list() | |
stopifnot( | |
"`mappings` must be for a single column" = all(lengths(cols) == 1) | |
) | |
tables_html <- htmltools::div( | |
style = htmltools::css( | |
display = "grid", max.width = "80%", margin = "0% 5%", | |
grid.template.columns = "1fr 1fr", grid.template.rows = "auto auto", | |
grid.gap = "10px 30px" | |
), | |
render_mappings_table( | |
mappings$new, type = "new", | |
title = "**New** problems and their repairs" | |
), | |
render_mappings_table( | |
mappings$changed, type = "changed", | |
title = "**Changed** repairs for previously-seen problems" | |
), | |
render_mappings_table( | |
mappings$all, type = "all", | |
title = "**All** problems and their repairs", | |
style = htmltools::css(grid.column = "1 / span 2") | |
) | |
) | |
title_html <- htmltools::tags$h2( | |
style = htmltools::css( | |
transform = "translateX(5%)", margin.bottom = "10px", font.size = "300%" | |
), | |
htmltools::HTML(paste(cols$col_raw, "→", cols$col_parsed)) | |
) | |
parsing_summary <- htmltools::div( | |
id = paste0("parse-", cols$col_raw), | |
title_html, tables_html | |
) | |
htmltools::browsable(parsing_summary) | |
} | |
compared_mappings %>% | |
lapply(split, ~ col_raw) %>% | |
purrr::transpose() %>% | |
lapply(render_parsing_summaries) | |
Author
yjunechoe
commented
Aug 16, 2024
•
## Setup ----
devtools::load_all(".")
library(dplyr)
library(tidyr)
library(gt)
mangle <- function(x) {
stringi::stri_sub(x, 1, 2) <- "__"
x
}
gen_tib <- function(i) {
tib <- bind_rows(
tibble(
col_raw = "MONTHS",
col_parsed = "months",
value_raw = toupper(month.name[i]),
value_parsed = month.name[i],
n = i
),
tibble(
col_raw = "STATES",
col_parsed = "states",
value_raw = toupper(state.name[i]),
value_parsed = state.name[i],
n = i
)
)
tib %>%
rowwise() %>%
mutate(value_raw = mangle(value_raw)) %>%
ungroup()
}
old_artifact1 <- gen_tib(1:3)
old_artifact2 <- gen_tib(3:4)
new_problems <- gen_tib(3:5) %>%
select(-value_parsed) %>%
uncount(n) %>%
mutate(row = sample(row_number() * 2))
new_artifacts <- gen_tib(3:5) %>%
uncount(n) %>%
mutate(row = sample(row_number() * 2)) %>%
count(pick(everything(), -row)) %>%
mutate(value_parsed = case_match(value_parsed, "Arkansas" ~ "Arkansaw", .default = value_parsed))
new_artifacts
## Setup simulate read/write in a project ----
init_temp_proj(dates = as.Date(0) + c(0, 365, 365*2))
write_parsing_artifacts(new_artifacts, script = "read_data")
write_parsing_artifacts(old_artifact2, script = "read_data",
path = path_to_prior(path_artifact("clin_data"), 1))
write_parsing_artifacts(old_artifact1, script = "read_data",
path = path_to_prior(path_artifact("clin_data"), 2))
old_artifacts <- fs::dir_ls(
dir_all_data()[-1],
recurse = TRUE, regexp = "clin_data/parsing.csv$"
) %>%
readr::read_csv()
## Table rendering ----
# Color palette for mapping types
map_pal <- function(x) {
dplyr::case_match(x, "changed" ~ "#FFA900", "new" ~ "skyblue", NA ~ "white")
}
# Render an html table of mappings
render_mappings_table <- function(mappings,
title = NULL,
type = c("all", "new", "changed"),
style = htmltools::css(),
...) {
# Return empty table for NULL / 0-row data
if (NROW(mappings) == 0) {
empty_table <- data.frame() %>%
gt::gt() %>%
gt::tab_header(title)
return(empty_table)
}
cols <- mappings %>%
dplyr::bind_rows() %>%
dplyr::distinct(.data$col_raw, .data$col_parsed) %>%
as.list()
stopifnot(
"`mappings` must be for a single column" = all(lengths(cols) == 1)
)
if (is.null(title)) {
title <- htmltools::HTML(paste(cols$col_raw, "→", cols$col_parsed))
}
# Make sure `.mapping` column is present
if (type != "all") {
mappings$.mapping <- type
}
# If comparing problems (vs. mappings), move new ones to the top
if (type == "all" && "row" %in% colnames(mappings)) {
mappings <- mappings %>%
dplyr::arrange(.data$.mapping, .data$row)
}
mappings_table <- mappings %>%
dplyr::mutate(.color = "", .before = 1L) %>%
gt::gt() %>%
gt::opt_interactive(...) %>%
gt::cols_hide(dplyr::any_of(c("col_raw", "col_parsed", ".mapping"))) %>%
gt::tab_header(gt::md(title)) %>%
gt::cols_label(.color = "") %>%
gt::data_color(".mapping", target_columns = ".color", fn = map_pal) %>%
gt::cols_width(.color ~ px(20))
htmltools::div(style = style, mappings_table) %>%
htmltools::browsable()
}
## Compare problems ----
new_problems
# Compre raw non-parsing values encountered (which ones are new?)
compare_parsing_problems <- function(new_problems, old_artifacts) {
new_problems <- dplyr::bind_rows(new_problems)
keys <- c("col_raw", "col_parsed", "value_raw")
new_values <- dplyr::anti_join(new_problems, old_artifacts, by = keys) %>%
dplyr::select(dplyr::all_of(keys)) %>%
dplyr::distinct() %>%
dplyr::mutate(.mapping = "new")
compared_problems <- left_join(new_problems, new_values, by = keys) %>%
dplyr::arrange(.data$col_raw, .data$row)
compared_problems
}
compared_problems <- compare_parsing_problems(new_problems, old_artifacts)
compared_problems %>%
split(~ col_raw) %>%
lapply(render_mappings_table, type = "all")
## Compare mappings ----
new_artifacts
# Extract updates to mappings between new and old parsing artifacts
compare_parsed_mappings <- function(new_artifacts, old_artifacts) {
keys <- c("col_raw", "col_parsed", "value_raw", "value_parsed")
# 1) New mappings (= previously unseen parsing problems)
maps_new <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys[-4])
maps_new$.mapping <- "new"
# All mapppings found in `new_artifacts` but not in `old_artifacts`
maps_new_chg <- dplyr::anti_join(new_artifacts, old_artifacts, by = keys)
# 2a) Changed mappings (= different repairs on a previously seen parsing problems)
maps_chg_bare <- dplyr::anti_join(maps_new_chg, maps_new, by = keys)
maps_chg_bare$.mapping <- "changed"
# 2b) Changed mappings with records of old mappings (+ their counts)
maps_chg <- dplyr::left_join(
maps_chg_bare %>%
dplyr::select(-"n"),
old_artifacts %>%
dplyr::rename("prior_parsed" = "value_parsed"),
by = keys[-4]
) %>%
dplyr::relocate(".mapping", .after = dplyr::last_col())
# 3) Combined record of all mappings: seen, new, and changed
maps_all <- dplyr::left_join(
new_artifacts, dplyr::bind_rows(maps_new, maps_chg_bare),
by = c(keys, "n")
)
# Return mapping analysis results
mappings <- list(all = maps_all, new = maps_new, changed = maps_chg)
mappings <- purrr::map(mappings, function(x) {
x %>% dplyr::arrange(.data$col_raw, dplyr::desc(.data$n))
})
mappings
}
compared_mappings <- compare_parsed_mappings(new_artifacts, old_artifacts)
compared_mappings
# Summary of 3 tables arranged for a parsed column
render_parsing_summaries <- function(mappings) {
cols <- mappings %>%
dplyr::bind_rows() %>%
distinct(.data$col_raw, .data$col_parsed) %>%
as.list()
stopifnot(
"`mappings` must be for a single column" = all(lengths(cols) == 1)
)
tables_html <- htmltools::div(
style = htmltools::css(
display = "grid", max.width = "80%", margin = "0% 5%",
grid.template.columns = "1fr 1fr", grid.template.rows = "auto auto",
grid.gap = "10px 30px"
),
render_mappings_table(
mappings$new, type = "new",
title = "**New** problems and their repairs"
),
render_mappings_table(
mappings$changed, type = "changed",
title = "**Changed** repairs for previously-seen problems"
),
render_mappings_table(
mappings$all, type = "all",
title = "**All** problems and their repairs",
style = htmltools::css(grid.column = "1 / span 2")
)
)
title_html <- htmltools::tags$h2(
style = htmltools::css(
transform = "translateX(5%)", margin.bottom = "10px", font.size = "300%"
),
htmltools::HTML(paste(cols$col_raw, "→", cols$col_parsed))
)
parsing_summary <- htmltools::div(
id = paste0("parse-", cols$col_raw),
title_html, tables_html
)
htmltools::browsable(parsing_summary)
}
compared_mappings %>%
lapply(split, ~ col_raw) %>%
purrr::transpose() %>%
lapply(render_parsing_summaries)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment