valentinitnelav · April 8, 2017 16:34
diff --git a/rvest_Read_div.R b/rvest_Read_div.R
 # Read <div> HTML tag with {rvest} using CSS selector
 # ====================================================

 # Load library
 library(rvest)

 # -----------------------
 # Read the web page
 # -----------------------
 # URL of the page to read
 link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"
 # NOTE: it is good practice to read the page once and reuse the parsed
 # document, instead of re-reading it many times and overloading their server
 link.scrap <- link %>% read_html()

 # -----------------------
 # Read division
 # -----------------------
 # Select what matches the CSS selector, take its text and break that text
 # into one trimmed string per line
 div.location <- link.scrap %>%
   html_nodes(css = '#historic_report_location') %>%
   # trim = TRUE strips the leading and trailing spaces of the extracted text
   html_text(trim = TRUE) %>%
   # splitting by \n (new line) returns a list
   strsplit(split = '\n') %>%
   # flatten the list; alternatively, .[[1]] or `[[`(1) would select only
   # the first element [[1]] of the list
   unlist() %>%
   # remove leading and trailing whitespace from every piece
   trimws()
 # If `trim = TRUE` is not used in `html_text()` above, an alternative for
 # skipping unwanted empty values is to append a subsetting step:
 # .[. != ""] # the . (dot) stands for the data passed on by the steps above
 # If doing so, do not forget to add the `%>%` operator after `trimws()`.

 div.location
 ##  [1] "Location/General"               "Pond Name: ALLEGANY BROOK POND" "Pond #: 020225B"               
 ##  [4] "Town: Black Brook"              "County: Clinton"                "USGS Quad: Redford"            
 ##  [7] "Watershed: Champlain"           "In the Adk park?: Y"            "Part of the ALTM program?: N"  
 ## [10] "Ownership: Private"             "Primitive Area: None"           "Wilderness Area: None"         
 ## [13] "Wild Forest Area: None" 

 # To understand why splitting by \n (new line) was needed,
 # run only the html_nodes() and html_text() part:
 # Same selection as before, but stop right after extracting the text
 div.location <- link.scrap %>%
   html_nodes(css = '#historic_report_location') %>%
   html_text(trim = TRUE)

 # Show the unsplit text in two ways: print() displays \n as an escape
 # sequence, whereas cat() interprets each \n as an actual new line
 print(div.location)
 cat(div.location)
 # All in all, the wanted information is separated by \n (new line)
	# Read <div> HTML tag with {rvest} using CSS selector
	# ====================================================

	# Load library
	library(rvest)

	# -----------------------
	# Read the web page
	# -----------------------
	# URL of the page to read
	link <- "http://www.adirondacklakessurvey.org/alscrpt.inc.php?alscpond=020225B&pname=ALLEGANY%20BROOK%20POND"
	# NOTE: it is good practice to read the page once and reuse the parsed
	# document, instead of re-reading it many times and overloading their server
	link.scrap <- link %>% read_html()

	# -----------------------
	# Read division
	# -----------------------
	# Select what matches the CSS selector, take its text and break that text
	# into one trimmed string per line
	div.location <- link.scrap %>%
	  html_nodes(css = '#historic_report_location') %>%
	  # trim = TRUE strips the leading and trailing spaces of the extracted text
	  html_text(trim = TRUE) %>%
	  # splitting by \n (new line) returns a list
	  strsplit(split = '\n') %>%
	  # flatten the list; alternatively, .[[1]] or `[[`(1) would select only
	  # the first element [[1]] of the list
	  unlist() %>%
	  # remove leading and trailing whitespace from every piece
	  trimws()
	# If `trim = TRUE` is not used in `html_text()` above, an alternative for
	# skipping unwanted empty values is to append a subsetting step:
	# .[. != ""] # the . (dot) stands for the data passed on by the steps above
	# If doing so, do not forget to add the `%>%` operator after `trimws()`.

	div.location
	## [1] "Location/General" "Pond Name: ALLEGANY BROOK POND" "Pond #: 020225B"
	## [4] "Town: Black Brook" "County: Clinton" "USGS Quad: Redford"
	## [7] "Watershed: Champlain" "In the Adk park?: Y" "Part of the ALTM program?: N"
	## [10] "Ownership: Private" "Primitive Area: None" "Wilderness Area: None"
	## [13] "Wild Forest Area: None"

	# To understand why splitting by \n (new line) was needed,
	# run only the html_nodes() and html_text() part:
	# Same selection as before, but stop right after extracting the text
	div.location <- link.scrap %>%
	  html_nodes(css = '#historic_report_location') %>%
	  html_text(trim = TRUE)

	# Show the unsplit text in two ways: print() displays \n as an escape
	# sequence, whereas cat() interprets each \n as an actual new line
	print(div.location)
	cat(div.location)
	# All in all, the wanted information is separated by \n (new line)