Created
March 28, 2019 16:07
-
-
Save paparaka/39f74dd5135f7ec659067fd13cb54866 to your computer and use it in GitHub Desktop.
Brexit alternative votes parsed in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(rvest) | |
require(stringr) | |
require(data.table) | |
require(tidyverse) | |
library(dplyr) | |
# URL of the Guardian interactive page that lists how each MP voted
url <- "https://www.theguardian.com/uk-news/ng-interactive/2019/mar/27/how-did-your-mp-vote-in-the-indicative-votes"

# Download and parse the page HTML
webpage <- read_html(url)

# One node per MP row inside the results table. The leading "." keeps the
# XPath relative to the .int-table node(s); a bare "//" is evaluated from the
# document root and would ignore the table scoping applied just before it.
rows <- webpage %>%
  html_nodes(".int-table") %>%
  html_nodes(xpath = './/div[@class="int-row int-row--mp"]')
#' Extract one MP's details and votes from a table row node.
#'
#' @param i A single row node (an element of `rows`) whose child `div`s hold
#'   the name, constituency, party and vote cells.
#' @return A data.frame in long format with columns `name`, `const`, `party`,
#'   `votes_id` and `vote`: one row per vote marker found in the row. The
#'   scalar name/constituency/party values are recycled by `data.frame()`.
scrape_row <- function(i) {
  # Text of a child cell selected by XPath, with newlines stripped.
  cell_text <- function(xpath) {
    i %>%
      html_nodes(xpath = xpath) %>%
      html_text() %>%
      str_replace_all("\n", "")
  }

  r_name <- cell_text('div[@class="int-cell int-cell--name"]')
  r_const <- cell_text('div[@class="int-cell int-cell--const"]')
  # The party cell is addressed by position: it is the first child div.
  r_party <- cell_text("div[1]")

  # Attribute values of each vote marker element, with the common
  # "gv-vote-blob gv-" prefix removed so only the vote outcome remains.
  r_votes <- i %>%
    html_nodes(xpath = 'div[@class="int-cell int-cell--vote"]/node()/*') %>%
    html_attrs() %>%
    unlist() %>%
    str_replace("gv-vote-blob gv-", "")

  # Label votes V1, V2, ... from the markers actually found. A fixed label
  # set would let data.frame() silently recycle a shorter vote vector against
  # it and produce wrong rows.
  votes_id <- sprintf("V%d", seq_along(r_votes))

  data.frame(
    name = r_name,
    const = r_const,
    party = r_party,
    votes_id = votes_id,
    vote = r_votes
  )
}
# Scrape every MP row into a long data frame (one row per MP per vote)
out_list <- lapply(rows, scrape_row)

# Bind all per-MP data frames into a single data.table in one pass
DT <- rbindlist(out_list)

# Reshape to wide: one row per MP, one column per vote id. The result is kept
# in `votes_wide` and also printed.
votes_wide <- DT %>%
  pivot_wider(names_from = votes_id, values_from = vote)
votes_wide

# Disabled draft: presumably meant to split `const` into constituency,
# referendum position and percentage.
# NOTE(review): the pattern below is not a valid R string as written
# (backslashes need doubling) -- fix before enabling.
# DT %>% separate(col = const,
#                 into = c("const", "refer_position", "percent"),
#                 sep = "\w /(\w \d\%\)" )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment