Last active
April 30, 2016 06:36
-
-
Save leeper/4e706b8396386b78a3fe7abb22f154ca to your computer and use it in GitHub Desktop.
Check URLs in a document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Check URLs in a document
## This code extracts URLs from a text document using a regex,
## then executes an HTTP HEAD request on each and reports whether
## the request failed, whether a redirect occurred, etc. It might
## be useful for cleaning up link rot.
# Install httr if it is not already available.
# requireNamespace() checks availability without attaching the package;
# require() would attach it only when already installed, leaving a fresh
# install never loaded — all later calls are httr::-qualified instead.
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr", repos = "http://cran.rstudio.com/")
}
#' Extract all URLs from a text document.
#'
#' @param file Path to the document to scan.
#' @param ... Unused; accepted for forward compatibility.
#' @return A character vector of the URLs found (possibly empty).
extract_urls <- function(file, ...) {
  if (!file.exists(file)) {
    stop("file not found: ", file, call. = FALSE)
  }
  # Read the whole file by its actual size. The original used
  # readBin(n = 1e7L), which silently truncated files over ~10 MB.
  size <- file.info(file)$size
  if (size == 0) {
    return(character(0))
  }
  txt <- readChar(file, nchars = size, useBytes = TRUE)
  # Credit: http://stackoverflow.com/questions/6038061/regular-expression-to-find-urls-within-a-string
  url_pattern <- "(http|ftp|https)://([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:/~+#-]*[\\w@?^=%&/~+#-])?"
  regmatches(txt, gregexpr(url_pattern, txt, perl = TRUE))[[1]]
}
#' Check a single URL with an HTTP HEAD request.
#'
#' @param url A single URL string.
#' @param ... Passed through to httr::HEAD() (e.g. timeout()).
#' @return A list with the original URL, the final URL after redirects,
#'   and logical/status indicators; all-NA fields when the request failed.
check_one <- function(url, ...) {
  # Wrap in try() so one unreachable URL does not abort the whole batch.
  r <- try(httr::HEAD(url, ...), silent = TRUE)
  if (inherits(r, "try-error")) {
    # Request failed entirely (DNS failure, timeout, ...): record NAs.
    list(url1 = url,
         url2 = NA_character_,
         redirect = NA,
         error = NA,
         status = NA_integer_  # status_code() returns integer, not double
    )
  } else {
    list(url1 = url,
         url2 = r$url,
         redirect = !identical(url, r$url),
         # Namespace-qualified (the original called http_error()/status_code()
         # bare, which fails whenever httr is installed but not attached).
         error = httr::http_error(r),
         status = httr::status_code(r)
    )
  }
}
#' Check a vector of URLs and collect the results.
#'
#' @param urls Character vector of URLs to check.
#' @param ... Passed through to check_one() for each URL.
#' @return A data frame of class "url_check", one row per URL.
check_urls <- function(urls, ...) {
  # Run check_one() over every URL, then stack the per-URL result
  # lists into a single data frame.
  results <- lapply(urls, check_one, ...)
  combined <- do.call(
    "rbind.data.frame",
    c(results, stringsAsFactors = FALSE, make.row.names = FALSE)
  )
  structure(combined, class = c("url_check", "data.frame"))
}
#' Print method for "url_check" results.
#'
#' Shows only the problematic rows: failed requests, redirects,
#' HTTP errors, or any non-200 status. Returns the full object
#' invisibly so the method pipes/chains cleanly.
print.url_check <- function(x, ...) {
  failed <- is.na(x[["url2"]])
  # NA redirect/error/status only occur on failed rows, where
  # `failed` is TRUE and dominates the logical OR.
  flagged <- failed | x[["redirect"]] | x[["error"]] | (x[["status"]] != 200)
  print.data.frame(x[flagged, , drop = FALSE])
  invisible(x)
}
## Example usage: replace "FILENAME" with the path to the document to scan.
u <- extract_urls("FILENAME") # extract the URLs found in the document
check_urls(u) # HEAD-request each URL; printing shows only problem rows
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This helped a lot, thank you :-) I might have found 2 small bugs that cause some URLs to falsely appear with an error and without a redirect:
http://doi.org/10.1016/0022-0981
in the output.