Skip to content

Instantly share code, notes, and snippets.

@PoisonAlien
Last active August 28, 2024 05:34
Show Gist options
  • Save PoisonAlien/d92e040f3b7849556e7504cd0fcc0c95 to your computer and use it in GitHub Desktop.
Save PoisonAlien/d92e040f3b7849556e7504cd0fcc0c95 to your computer and use it in GitHub Desktop.
---
title: "Sequencing run summary"
date: "Generated on: `r Sys.Date()`"
output:
html_document:
toc: true
toc_depth: 3
toc_float: true
self_contained: yes
theme: sandstone
highlight: tango
params:
summary_file: NULL
bam_manta_files: NULL
metrics_dir: NULL
filter_stats_file: NULL
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, fig.width = 8)
```
<img src="https://www.arcensus-diagnostics.com/wp-content/uploads/arcensus-logo_monochrome-blue-1.svg" height="42" width="250" style="position:absolute;top:30px;right:10px;" >
```{r}
library(ggplot2)
library(plotly)
library(data.table)
library(kableExtra)
#x = data.table::fread("~/Documents/arc_rsa/S7100_240729_1033_summary.tsv")
x = data.table::fread(params$summary_file)
x$sample_id = as.character(x$sample_id)
x$sex = ifelse(test = x$ploidy_estimation == "XY", yes = "♂", no = ifelse(test = x$ploidy_estimation == "XX", yes = "♀", no = "⚤"))
#file_pahts = data.table::fread("~/Documents/arc_rsa/S7100_240729_1033.bam_manta_files.tsv")
file_pahts = data.table::fread(params$bam_manta_files)
file_pahts$sample_id = data.table::tstrsplit(x = file_pahts$sample_id, split = "_", keep = 1) |> unlist() |> as.character()
x = merge(file_pahts, x, by = "sample_id", all = TRUE)
```
# Arcflows summary
## Run summary {.tabset .tabset-fade .tabset-pills}
### General info
```{r arcflow_gen_stat}
dt = x[,.(sample_id, date_received, date_processed, sex, contamination_level)]
dt_fp = x[,.(sample_id, BAM_path, SV_path, CNV_path)]
kableExtra::kbl(x = dt, col.names = gsub("_", " ", names(dt)), booktabs = TRUE, full_width = TRUE, html_font = "Cambria", escape = FALSE) |> kable_paper(full_width = TRUE) |> kable_styling(bootstrap_options = c("hover", "condensed", "striped"))
```
### variant filtering
```{r arcflow_var_filt}
#filter_stats_dir = "/Users/anand/Documents/arc_rsa/metrics/filterstats/"
filter_stats_files = data.table::fread(input = params$filter_stats_file, header = FALSE)$V1
filter_stats = lapply(filter_stats_files, function(y) {
y = data.table::fread(y)
colnames(y) = "n"
y$filter = c("raw", "QC", "gnomadAF10", "gnomadAF05", "gnomadAF02", "arcAF10")
y
})
names(filter_stats) = basename(path = filter_stats_files) |> data.table::tstrsplit(split = "_", keep = 1) |> unlist() |> as.character()
filter_stats = data.table::rbindlist(l = filter_stats, use.names = TRUE, fill = TRUE, idcol = "sample_id") |> data.table::dcast(sample_id ~ filter, value.var = "n")
filter_stats = filter_stats[,.(sample_id, raw, QC, gnomadAF10, gnomadAF05,gnomadAF02, arcAF10)]
```
```{r mappingKbl}
k = kableExtra::kbl(x = filter_stats, col.names = gsub("_", " ", names(filter_stats)), booktabs = TRUE, full_width = TRUE, html_font = "Cambria", escape = FALSE, format.args = list(big.mark = ",")) |> kable_paper(full_width = TRUE) |> kable_styling(bootstrap_options = c("hover", "condensed", "striped", "striped"))
k = column_spec(kable_input = k, column = 7, color = "white",
background = spec_color(filter_stats$arcAF10, end = 0.7, direction = -1, alpha = 0.6))
k
```
### Ancestry
```{r arcflow_ancestry}
anc = x[,.(sample_id, prob_afr, prob_ami, prob_amr, prob_asj, prob_eas, prob_fin, prob_mid, prob_nfe, prob_oth, prob_sas)]
x_anc = data.table::melt(data = anc, id.vars = "sample_id")
colnames(x_anc) = c("sample_id", "ancestry", "prob")
x_anc$ancestry = gsub(pattern = "prob_", replacement = "", x = x_anc$ancestry)
x_anc$ancestry = factor(x = x_anc$ancestry, levels = c('afr', 'ami', 'amr', 'asj', 'eas', 'fin', 'mde', 'nfe', 'oth', 'sas'))
col_codes = setNames(nm = c('afr', 'ami', 'amr', 'asj', 'eas', 'fin', 'mde', 'nfe', 'oth', 'sas'), object = c('#941494', '#FFC0CB', '#ED1E24', '#FF7F50', '#108C44', '#002F6C', '#33CC33', '#6AA5CD', '#ABB9B9', '#FF9912'))
anc_plot = ggplot(data = x_anc, aes(x = sample_id, y = prob, fill = ancestry))+ geom_bar(stat="identity", colour="white")+scale_fill_manual(values = col_codes)+xlab("")+ylab("Population probability")+theme_minimal()+coord_flip()
plotly::ggplotly(p = anc_plot)
```
### Result file paths
```{r arcflow_filepaths}
dt_fp$BAM_path = text_spec(format = "html", x = dt_fp$sample_id, link = dt_fp$BAM_path, underline = TRUE, new_tab = TRUE)
dt_fp$SV_path = text_spec(format = "html", x = dt_fp$sample_id, link = dt_fp$SV_path, underline = TRUE, new_tab = TRUE)
dt_fp$CNV_path = text_spec(format = "html", x = dt_fp$sample_id, link = dt_fp$CNV_path, underline = TRUE, new_tab = TRUE)
kableExtra::kbl(x = dt_fp, col.names = gsub("_", " ", names(dt_fp)), booktabs = TRUE, full_width = TRUE, html_font = "Cambria", escape = FALSE) |> kable_paper(full_width = TRUE) |> kable_styling(bootstrap_options = c("hover", "condensed", "striped"))
```
# DRAGEN summary
## General Statistics
```{r}
#metrics_dir = "/Users/anand/Documents/arc_rsa/metrics/"
mapping_metric_files = list.files(path = params$metrics_dir, pattern = "*.mapping_metrics.csv", full.names = TRUE)
mapping_metrics = lapply(mapping_metric_files, function(x){
xd = data.table::fread(input = x, fill = TRUE, sep = ",")
Input_reads = xd[V3 %in% "Total input reads", V4] |> as.numeric()
Dup_reads = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Number of duplicate marked reads", V5] |> as.numeric()
Properly_paired = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Properly paired reads", V5] |> as.numeric()
Unmapped_reads = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Unmapped reads", V5] |> as.numeric()
Est_fract_contamination = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Estimated sample contamination", V4] |> as.numeric()
Est_readlen = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Estimated read length", V4] |> as.numeric()
median_IS = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Insert length: median", V4] |> as.numeric()
data.table::data.table(Input_reads, Unmapped_reads, Dup_reads, Properly_paired, median_IS, Est_readlen, Est_fract_contamination)
})
names(mapping_metrics) = basename(path = mapping_metric_files) |> data.table::tstrsplit(split = "_", keep = 1) |> unlist()
mapping_metrics = data.table::rbindlist(mapping_metrics, use.names = TRUE, fill = TRUE, idcol = "sample_id")
```
```{r}
k = kableExtra::kbl(x = mapping_metrics, col.names = gsub("_", " ", names(mapping_metrics)), booktabs = TRUE, full_width = TRUE, html_font = "Cambria", escape = FALSE, format.args = list(big.mark = ",")) |> kable_paper(full_width = TRUE) |> kable_styling(bootstrap_options = c("hover", "condensed", "striped", "striped"))
k = column_spec(kable_input = k, column = 3, color = "white",
background = spec_color(mapping_metrics$Unmapped_reads, end = 0.7, direction = 1, alpha = 0.6, palette = viridisLite::viridis(256, 0.6, 0, 0.7, 1, "turbo")))
k = column_spec(kable_input = k, column = 4, color = "white",
background = spec_color(mapping_metrics$Dup_reads, end = 0.7, direction = -1, alpha = 0.6))
k
```
## Alignment metrics {.tabset .tabset-fade .tabset-pills}
```{r, echo=FALSE}
#percent
dragen_metrics_pct = lapply(mapping_metric_files, function(x){
xd = data.table::fread(input = x, fill = TRUE, sep = ",")
pct_prop_paired_reads = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Properly paired reads", V5] |> as.numeric()
pct_paired_discor_reads = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Not properly paired reads (discordant)", V5] |> as.numeric()
pct_singleton = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Singleton reads (itself mapped; mate unmapped)", V5] |> as.numeric()
pct_unmapped = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Unmapped reads", V5] |> as.numeric()
data.table::data.table(pct_prop_paired_reads, pct_paired_discor_reads, pct_singleton, pct_unmapped)
})
names(dragen_metrics_pct) = basename(path = mapping_metric_files) |> data.table::tstrsplit(split = "_", keep = 1) |> unlist()
dragen_metrics_pct = data.table::rbindlist(dragen_metrics_pct, use.names = TRUE, fill = TRUE, idcol = "sample_id")
dragen_metrics_pct = data.table::melt(data = dragen_metrics_pct, id.vars = "sample_id")
#Raw
dragen_metrics_raw = lapply(mapping_metric_files, function(x){
xd = data.table::fread(input = x, fill = TRUE, sep = ",")
raw_prop_paired_reads = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Properly paired reads", V4] |> as.numeric()
raw_paired_discor_reads = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Not properly paired reads (discordant)", V4] |> as.numeric()
raw_singleton = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Singleton reads (itself mapped; mate unmapped)", V4] |> as.numeric()
raw_unmapped = xd[V1 %in% "MAPPING/ALIGNING SUMMARY"][V3 %in% "Unmapped reads", V4] |> as.numeric()
data.table::data.table(raw_prop_paired_reads, raw_paired_discor_reads, raw_singleton, raw_unmapped)
})
names(dragen_metrics_raw) = basename(path = mapping_metric_files) |> data.table::tstrsplit(split = "_", keep = 1) |> unlist()
dragen_metrics_raw = data.table::rbindlist(dragen_metrics_raw, use.names = TRUE, fill = TRUE, idcol = "sample_id")
dragen_metrics_raw = data.table::melt(data = dragen_metrics_raw, id.vars = "sample_id")
```
### Percentages
```{r}
dragen_mapping_colrs2 = setNames(nm = c("pct_prop_paired_reads", "pct_paired_discor_reads", "pct_singleton",
"pct_unmapped"), object = c('#108C44', '#FF7F50', '#941494', '#ED1E24'))
dragen_metrics_pct_plot = ggplot(data = dragen_metrics_pct, aes(x = sample_id, y = value, fill = variable))+geom_bar(stat="identity", colour="white")+theme_minimal()+coord_flip()+scale_fill_manual(values = dragen_mapping_colrs2)+xlab(label = element_blank())+ylab(label = element_blank())
plotly::ggplotly(p = dragen_metrics_pct_plot)
```
### Raw
```{r}
dragen_mapping_colrs = setNames(nm = c("raw_prop_paired_reads", "raw_paired_discor_reads", "raw_singleton",
"raw_unmapped"), object = c('#108C44', '#FF7F50', '#941494', '#ED1E24'))
dragen_metrics_raw_plot = ggplot(data = dragen_metrics_raw, aes(x = sample_id, y = value, fill = variable))+geom_bar(stat="identity", colour="white")+theme_minimal()+coord_flip()+scale_fill_manual(values = dragen_mapping_colrs)+xlab(label = element_blank())+ylab(label = element_blank())
plotly::ggplotly(p = dragen_metrics_raw_plot)
```
## Coverage {.tabset .tabset-fade .tabset-pills}
### XY coverage
```{r}
xy_cov = x[,.(sample_id, autosomal_median_coverage, x_median_coverage, y_median_coverage)]
colnames(xy_cov) = c("sample_id", "autosomes", "X", "Y")
xy_cov = data.table::melt(data = xy_cov, id.vars = "sample_id")
colnames(xy_cov) = c("sample_id", "chromosome", "median_coverage")
xy_cov_plot = ggplot(data = xy_cov, aes(x = chromosome, y = median_coverage, fill = chromosome))+geom_bar(stat="identity")+facet_grid(~sample_id)+theme_minimal()+theme(axis.text.x = element_blank())+ylab("Median coverage")+xlab("")+scale_fill_manual(values = list(autosomes = "#34495e", X = "#e84393", Y = "#0984e3"))
plotly::ggplotly(p = xy_cov_plot)
```
### Genome coverage
```{r}
wgs_coverage_metric_files = list.files(path = params$metrics_dir, pattern = "*.wgs_coverage_metrics.csv", full.names = TRUE)
if(length(wgs_coverage_metric_files) > 0){
genome_cvg = lapply(wgs_coverage_metric_files, function(x){
xd = data.table::fread(input = x, sep = ",", fill = TRUE)
xd = xd[V3 %like% "PCT of genome with coverage"][!V3 %like% "inf"][,.(V3, V4)]
xd$V3 = gsub(pattern = "PCT of genome with coverage ", replacement = "", x = xd$V3)
colnames(xd) = c("coverage", "pct_genome")
xd
})
names(genome_cvg) = data.table::tstrsplit(x = basename(wgs_coverage_metric_files), split = "_", keep = 1) |> unlist() |> as.character()
genome_cvg = data.table::rbindlist(l = genome_cvg, idcol = "sample_id", use.names = TRUE, fill = TRUE)
genome_cvg_plot = ggplot(data = genome_cvg, aes(x = sample_id, y = pct_genome, fill = coverage))+ geom_bar(stat="identity", colour="white")+xlab("")+ylab("% Genome")+theme_minimal()+coord_flip()
plotly::ggplotly(p = genome_cvg_plot)
}
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment