Last active
April 8, 2017 16:34
-
-
Save valentinitnelav/3577d5a1e3e5ac51b6776521da2eec78 to your computer and use it in GitHub Desktop.
Read <div> HTML tag with {rvest} using CSS selector
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read <div> HTML tag with {rvest} using CSS selector
# ====================================================

# Load library
# {rvest} supplies read_html(), html_nodes(), html_text() and the %>% pipe
# used throughout this script.
library(rvest)

# -----------------------
# Read the web page
# -----------------------
# URL of the page to scrape.
link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"

# NOTE: it is ethical to read the page once and reuse the stored object,
# instead of requesting it unnecessarily many times and overloading their server.
link.scrap <- read_html(link)
# -----------------------
# Read division
# -----------------------
# Select the node(s) matching the CSS id selector, extract their text and
# break it into a character vector with one element per line of text.
div.location <-
  html_nodes(x = link.scrap,
             css = "#historic_report_location") %>%
  html_text(trim = TRUE) %>% # note the trim = TRUE to trim leading and trailing spaces
  strsplit(split = "\n") %>% # split by \n (new line), will return a list
  unlist() %>%               # or use .[[1]] or `[[`(1) to select only the first element [[1]] of the list
  trimws()                   # remove leading and trailing whitespace from each element

# If `trim = TRUE` is not used in `html_text(trim = TRUE)` above,
# then an alternative for skipping unwanted empty values would be to subset:
#   .[. != ""]  # skip empty values, where . (dot) symbolizes the data as
#               # inherited from the operations above.
# If doing so, then don't forget the `%>%` operator after `trimws()` above.

# Print the result; the expected output is shown in the `##` lines.
div.location
##  [1] "Location/General" "Pond Name: ALLEGANY BROOK POND" "Pond #: 020225B"
##  [4] "Town: Black Brook" "County: Clinton" "USGS Quad: Redford"
##  [7] "Watershed: Champlain" "In the Adk park?: Y" "Part of the ALTM program?: N"
## [10] "Ownership: Private" "Primitive Area: None" "Wilderness Area: None"
## [13] "Wild Forest Area: None"
# To understand why splitting by \n (new line) was needed,
# run only the html_nodes() and html_text() part.
# This overwrites div.location with the unsplit text of the selected node(s).
div.location <-
  html_nodes(x = link.scrap,
             css = "#historic_report_location") %>%
  html_text(trim = TRUE)

# Now, print div.location with print() and cat().
# Note that cat() knows how to interpret \n (new line), whereas print()
# shows it as an escaped "\n" inside the quoted string.
print(div.location)
cat(div.location)
# All in all, the wanted information is separated by \n (new line).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment