Last active: April 1, 2018, 17:39
Save noamross/88a51bb880f18da88e4b259eefdefe87 to your computer and use it in GitHub Desktop.
Get the content of all bat Wikipedia pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load packages: tidyverse for data wrangling and purrr mapping,
# xml2/rvest for HTML parsing, WikipediR for the Wikipedia API,
# and urltools for decoding percent-encoded page titles.
library(tidyverse)
library(xml2)
library(rvest)
library(WikipediR)
library(urltools)
# Get all species-level page titles from the Wikipedia list of bats.
# The XPath selects anchor tags inside list items nested under a list item
# containing "Genus", keeping only internal links (href starting with
# "/wiki/"). basename() strips the "/wiki/" path prefix, and url_decode()
# restores percent-encoded characters in the page titles.
bat_titles <- read_html("https://en.wikipedia.org/wiki/List_of_bats") %>%
  html_nodes(xpath = "//ul/li[contains(., 'Genus')]/ul/li/a[starts-with(@href, '/wiki/')]") %>%
  xml_attr("href") %>%
  basename() %>%
  url_decode()
# Get the content of all those pages (takes a couple of mins!).
# One Wikipedia API call per title; each page contributes one row with the
# page title and its rendered HTML content.
bat_info <- map_df(bat_titles, function(page_name) {
  # page_content() returns the nested MediaWiki parse-API response;
  # the rendered HTML lives at $parse$text$`*`.
  resp <- page_content(language = "en", project = "wikipedia",
                       page_name = page_name)
  tibble(title = resp$parse$title,
         content = resp$parse$text$`*`)
})
# Extract just the text from the HTML: parse each page's HTML string and
# flatten it to plain text (map_chr guarantees a character column).
bat_text <- bat_info %>%
  mutate(content = map_chr(content, ~ html_text(read_html(.))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.