Last active: April 1, 2018, 17:39
Save noamross/88a51bb880f18da88e4b259eefdefe87 to your computer and use it in GitHub Desktop.
Get the content of all bat Wikipedia pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load packages: tidyverse for data wrangling and purrr mapping,
# xml2/rvest for HTML parsing, WikipediR for the Wikipedia API,
# and urltools for decoding percent-encoded page titles.
library(tidyverse)
library(xml2)
library(rvest)
library(WikipediR)
library(urltools)
# Get all species-level page titles from the Wikipedia list of bats.
# The XPath selects anchor tags inside list items nested under a list item
# containing "Genus", keeping only internal links (href starting with
# "/wiki/"). basename() strips the "/wiki/" path prefix, and url_decode()
# restores percent-encoded characters in the page titles.
bat_titles <- read_html("https://en.wikipedia.org/wiki/List_of_bats") %>%
  html_nodes(xpath = "//ul/li[contains(., 'Genus')]/ul/li/a[starts-with(@href, '/wiki/')]") %>%
  xml_attr("href") %>%
  basename() %>%
  url_decode()
# Get the content of all those pages (takes a couple of mins!).
# One Wikipedia API call per title; each page contributes one row with the
# page title and its rendered HTML content.
bat_info <- map_df(bat_titles, function(page_name) {
  # page_content() returns the nested MediaWiki parse-API response;
  # the rendered HTML lives at $parse$text$`*`.
  resp <- page_content(language = "en", project = "wikipedia",
                       page_name = page_name)
  tibble(title = resp$parse$title,
         content = resp$parse$text$`*`)
})
# Extract just the text from the HTML: parse each page's HTML string and
# flatten it to plain text (map_chr guarantees a character column).
bat_text <- bat_info %>%
  mutate(content = map_chr(content, ~ html_text(read_html(.))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.