library(Rcpp)
library(purrr)
library(stringr)
library(bench)
# Helper function to both functions
# Extracts the SeqId portion of a string & replaces the "." -> "-"
getSeqId <- function(x) {
x <- stringr::str_trim(x) # zap whitespace
match_mat <- stringr::str_locate(x, "[0-9]{4,5}[-.][0-9]{1,3}([._][0-9]{1,3})?$")
args <- list(
string = x,
start = match_mat[, "start"],
end = match_mat[, "end"]
)
purrr::pmap_chr(args, stringr::str_sub) %>%
stringr::str_replace("\\.", "-")
}
# The C++ version of matchSeqIds
Rcpp::cppFunction('
#include <Rcpp.h>
#include <unordered_map>
Rcpp::StringVector
matchseq_cpp(CharacterVector x, // string containing SeqId entries //
CharacterVector y, // string containing SeqIds to match x //
bool order_by_x = true) { // logical on ordering //
Function base_intersect("intersect"); // cannot use sugar; order differs; small speed cost tho //
Function getSeqId("getSeqId"); // get function from outside //
CharacterVector x_seqs = getSeqId(x); // get SeqIds; non-seqs are NA //
x_seqs = Rcpp::na_omit(x_seqs); // rm NAs //
CharacterVector y_seqs = getSeqId(y); // get SeqIds from y //
int L = y.size();
std::unordered_map< std::string, std::string > hashmap; // initialize hashmap //
for(int i = 0; i < L; i++) {
hashmap.insert(std::make_pair(y_seqs[i], y[i])); // populate hashmap //
}
CharacterVector ord_seqs(L); // initiate vector for intersect //
if (order_by_x) {
ord_seqs = base_intersect(x_seqs, y_seqs); // get SeqId intersection of x & y //
} else {
ord_seqs = base_intersect(y_seqs, x_seqs); // get SeqId intersection of y & x //
}
int n = ord_seqs.size();
Rcpp::StringVector res(n);
for(int j = 0; j < n; j++) {
// get the corresponding y values //
res[j] = hashmap[ Rcpp::as< std::string >(ord_seqs[j]) ];
}
return res;
}')
# The original matchSeqIds
# A Seqid is of this form: XYZ.1234.56 or XYZ-1234_45
# Match on the SeqId portion; NOT the preceeding ":alpha:" string
matchSeqIds <- function(x, y, order.by.x = TRUE) {
x_seqIds <- getSeqId(x) %>% purrr::discard(is.na)
# create lookup table to index SeqIds to their values in 'y'
y_lookup <- as.list(y) %>% purrr::set_names(getSeqId(y))
y_seqIds <- names(y_lookup) %>% purrr::discard(is.na) # rm NAs in 'y'
if (order.by.x) {
order_seqs <- intersect(x_seqIds, y_seqIds)
} else {
order_seqs <- intersect(y_seqIds, x_seqIds)
}
if (length(order_seqs) == 0) {
return(character(0))
}
purrr::map_chr(order_seqs, ~ y_lookup[[.x]])
}
# Generate a dummy long string of combined SeqIds
set.seed(101)
n <- 5000
gene <- replicate(n, paste0(sample(LETTERS, 4, replace = TRUE), collapse = ""))
x <- paste0(gene, ".", sample(1:15000, n), ".", sample(1:999, n, replace = TRUE))
# sample 10 from x and modify the GeneID
# so that full matches can't be done
# must match based on SeqId
y <- sample(x, 2500) # almost all of 'x' but still a subset
y <- paste0(sample(LETTERS, n, replace = TRUE), y)
# Benchmark
bnch <- bench::mark(
matchseq_cpp = matchseq_cpp(x, y),
matchSeqIds = matchSeqIds(x, y),
iterations = 100
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
# Absolute
# Only ~ 2x improvement
# Longer 'y' results in > improvement (more matching to do)
bnch
#> # A tibble: 2 x 10
#> expression min mean median max `itr/sec` mem_alloc n_gc n_itr
#> <bch:expr> <bch:> <bch:> <bch:> <bch:t> <dbl> <bch:byt> <dbl> <int>
#> 1 matchseq_cpp 31.5ms 38ms 38ms 46.3ms 26.3 1.23MB 67 100
#> 2 matchSeqIds 66.1ms 75.9ms 75.2ms 116.4ms 13.2 1.47MB 129 100
#> # … with 1 more variable: total_time <bch:tm>
# Relative
summary(bnch, relative = TRUE)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 x 10
#> expression min mean median max `itr/sec` mem_alloc n_gc n_itr
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 matchseq_cpp 1 1 1 1 2.00 1 1 1
#> 2 matchSeqIds 2.10 2.00 1.98 2.52 1 1.20 1.93 1
#> # … with 1 more variable: total_time <dbl>
Created on 2019-08-27 by the reprex package (v0.2.1)
Session info
devtools::session_info()
#> ─ Session info ──────────────────────────────────────────────────────────
#> setting value
#> version R version 3.5.2 (2018-12-20)
#> os macOS Mojave 10.14.6
#> system x86_64, darwin15.6.0
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz America/Denver
#> date 2019-08-27
#>
#> ─ Packages ──────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.0 2017-04-11 [1] CRAN (R 3.5.0)
#> backports 1.1.3 2018-12-14 [1] CRAN (R 3.5.0)
#> bench * 1.0.1.9000 2019-02-01 [1] Github (r-lib/bench@3e5d63f)
#> callr 3.1.1 2018-12-21 [1] CRAN (R 3.5.0)
#> cli 1.1.0 2019-03-19 [1] RSPM (R 3.5.2)
#> crayon 1.3.4 2017-09-16 [1] RSPM (R 3.5.2)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 3.5.0)
#> devtools 2.0.2 2019-04-08 [1] RSPM (R 3.5.2)
#> digest 0.6.18 2018-10-10 [1] CRAN (R 3.5.0)
#> evaluate 0.13 2019-02-12 [1] CRAN (R 3.5.2)
#> fansi 0.4.0 2018-10-05 [1] CRAN (R 3.5.0)
#> fs 1.2.6 2018-08-23 [1] CRAN (R 3.5.0)
#> glue 1.3.1 2019-03-12 [1] CRAN (R 3.5.2)
#> highr 0.7 2018-06-09 [1] CRAN (R 3.5.0)
#> htmltools 0.3.6 2017-04-28 [1] CRAN (R 3.5.0)
#> knitr 1.22 2019-03-08 [1] CRAN (R 3.5.2)
#> magrittr 1.5 2014-11-22 [1] RSPM (R 3.5.2)
#> memoise 1.1.0 2017-04-21 [1] CRAN (R 3.5.0)
#> pillar 1.3.1 2018-12-15 [1] CRAN (R 3.5.0)
#> pkgbuild 1.0.2 2018-10-16 [1] CRAN (R 3.5.0)
#> pkgconfig 2.0.2 2018-08-16 [1] CRAN (R 3.5.0)
#> pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.5.0)
#> prettyunits 1.0.2 2015-07-13 [1] CRAN (R 3.5.0)
#> processx 3.2.1 2018-12-05 [1] CRAN (R 3.5.0)
#> profmem 0.5.0 2018-01-30 [1] CRAN (R 3.5.0)
#> ps 1.3.0 2018-12-21 [1] CRAN (R 3.5.0)
#> purrr * 0.3.2 2019-03-15 [1] RSPM (R 3.5.2)
#> R6 2.4.0 2019-02-14 [1] CRAN (R 3.5.2)
#> Rcpp * 1.0.1 2019-03-17 [1] RSPM (R 3.5.2)
#> remotes 2.0.2 2018-10-30 [1] CRAN (R 3.5.0)
#> rlang 0.3.4 2019-04-07 [1] RSPM (R 3.5.2)
#> rmarkdown 1.11 2018-12-08 [1] CRAN (R 3.5.0)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.5.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.5.0)
#> stringi 1.4.3 2019-03-12 [1] CRAN (R 3.5.2)
#> stringr * 1.4.0 2019-02-10 [1] RSPM (R 3.5.2)
#> testthat 2.2.0 2019-07-22 [1] CRAN (R 3.5.2)
#> tibble 2.1.1 2019-03-16 [1] RSPM (R 3.5.2)
#> usethis 1.5.0 2019-04-07 [1] CRAN (R 3.5.2)
#> utf8 1.1.4 2018-05-24 [1] CRAN (R 3.5.0)
#> withr 2.1.2 2018-03-15 [1] CRAN (R 3.5.0)
#> xfun 0.5 2019-02-20 [1] CRAN (R 3.5.2)
#> yaml 2.2.0 2018-07-25 [1] CRAN (R 3.5.0)
#>
#> [1] /Users/sfield/r_libs
#> [2] /Library/Frameworks/R.framework/Versions/3.5/Resources/library