-
-
Save hemprichbennett/62e4facca777d1ea0b0287698d337786 to your computer and use it in GitHub Desktop.
This script mines a Wikipedia page for bat facts and, if they pass a basic quality check, tweets them out. It then waits about 3.5 hours (211 minutes) before doing it again.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Wikipedia querying from Noam Ross https://gist.github.com/noamross/88a51bb880f18da88e4b259eefdefe87
library(tidyverse)
library(xml2)
library(rvest)
library(WikipediR)
library(urltools)
library(stringr)
library(tokenizers)
library(rtweet)
library(here)

# Work from the directory reported by here(), so the temporary image file the
# script writes before tweeting lands in a predictable place.
setwd(here())

# ---- Configuration ----

# TRUE: keep this R session alive and tweet repeatedly, sleeping between
# tweets. FALSE: tweet once and finish.
wait_in_r <- TRUE

# Number of seconds to wait between tweets (211 minutes).
wait_duration <- 211 * 60

# Wikipedia list page that the species pages are harvested from.
base_url <- "https://en.wikipedia.org/wiki/List_of_bats"

# If you don't want a hashtag just assign this an empty character string.
hashtag <- "#bats"

# The twitter token was generated with the instructions at
# http://rtweet.info/articles/auth.html and saved to disk with saveRDS().
# Set the TWITTER_TOKEN_PATH environment variable to load it from another
# location; when it is unset the original hard-coded path is used.
twitter_token_path <- Sys.getenv(
  "TWITTER_TOKEN_PATH",
  unset = "/Users/davehemprichbennett/twitter_token.rds"
)
twitter_token <- readRDS(twitter_token_path)
##### Noam's wikipedia querying ####

# Parse the page at `url`, returning NULL instead of raising an error, so that
# a missing page or a network hiccup only costs one candidate tweet rather
# than stopping the whole script.
read_html_or_null <- function(url) {
  tryCatch(read_html(url), error = function(e) NULL)
}

# Split the plain text of an article into a character vector of section
# bodies named by their headings. Headings are recognised by the "[edit]"
# marker that follows them in the rendered text.
extract_sections <- function(page_text) {
  heading_pattern <- ".+\\[edit\\]"
  headings <- str_extract_all(page_text, heading_pattern)[[1]]
  headings <- gsub("\\[edit\\]", "", headings)
  # The first piece is the text before the first heading; it is dropped so
  # that the bodies line up one-to-one with the headings.
  bodies <- str_split(page_text, heading_pattern)[[1]][-1]
  # Newlines become spaces (not nothing) so that the last sentence of one
  # paragraph is not glued onto the first sentence of the next.
  bodies <- gsub("\n", " ", bodies, fixed = TRUE)
  # Drop bracketed references of any length ("[1]", "[12]", "[a]"); they
  # make no sense out of context. "[^]]" means "any character except ]".
  bodies <- gsub("\\[[^]]*\\]", "", bodies)
  bodies <- gsub("\\", "", bodies, fixed = TRUE)
  names(bodies) <- headings
  bodies
}

# The script schedules itself with an endless loop plus Sys.sleep(). This is a
# crude way of doing it, but it works. To be ironed out later.
repeat {
  # Get all species-level page titles from the Wikipedia list of bats
  bat_titles <- read_html(base_url) %>%
    html_nodes(
      xpath = "//ul/li[contains(., 'Genus')]/ul/li/a[starts-with(@href, '/wiki/')]"
    ) %>%
    xml_attr("href") %>%
    basename() %>%
    url_decode()

  if (length(bat_titles) == 0) {
    stop("No species pages were found at ", base_url, call. = FALSE)
  }

  ##### And now we're on Dave's far less elegant code #####
  # Pick a random bat page, then some consecutive sentences from a random
  # section of it. Candidates that fail any QC step are discarded and another
  # page is tried, until one tweetable fact has been assembled.
  repeat {
    page_title <- sample(bat_titles, 1)
    parsed <- tryCatch(
      page_content(language = "en", project = "wikipedia", page_name = page_title),
      error = function(e) NULL
    )
    if (is.null(parsed)) {
      next
    }
    sp_name <- parsed$parse$title # The bat species' name
    page_html <- parsed$parse$text$`*`
    if (is.null(sp_name) || is.null(page_html) || !nzchar(page_html)) {
      next
    }

    # Extract just the text from the HTML and break it into named sections
    page_text <- html_text(read_html(page_html))
    sections <- extract_sections(page_text)

    # Some pages only contain references (one section) or are completely
    # blank (no sections). These are garbage and should be skipped.
    if (length(sections) < 2) {
      next
    }

    # Reference-like sections would make for a terrible tweet, and
    # measurement-based tweets would be very boring.
    dull <- names(sections) %in% c("References", "Sources", "Footnotes") |
      grepl("easurements", names(sections), fixed = TRUE)
    sections <- sections[!dull]
    if (length(sections) == 0) {
      next
    }

    # Choose a random section and split it into one string per sentence
    chosen_section <- unname(sample(sections, 1))
    sentences <- tokenize_sentences(chosen_section)[[1]]
    n_sentences <- length(sentences)
    if (n_sentences < 2) { # Skip if the section is empty or tiny
      next
    }

    # Take two or three consecutive sentences. The end point is clamped to
    # the last sentence so the selection never runs off the end of the section.
    start_point <- sample.int(n_sentences - 1, 1)
    end_point <- min(start_point + sample(c(1, 2), 1), n_sentences)
    outstring <- paste(sentences[start_point:end_point], collapse = " ")

    #### Now its wikimedia time to get an image and its creator ####
    # The alt text of the first image in the page is used as the NAME of the
    # image, to be queried on wikimedia.
    photo_details <- str_split(page_html, pattern = "\" src")[[1]][1]
    photo_details <- str_split(photo_details, pattern = "<img alt=\\\"")[[1]][2]
    photo_details <- gsub(" ", "_", photo_details)
    # Unable to get a decent photo (another good photo may be available in
    # the page in a different position, but the code isn't complex enough to
    # search for it), so skip. The NA test must come first: the split above
    # yields NA when the page has no matching <img> tag.
    if (is.na(photo_details) || !nzchar(photo_details)) {
      next
    }

    # Wikipedia doesn't include attribution for wikimedia images on the
    # article page itself, so wikimedia has to be queried for it instead.
    photo_credit_url <- paste0(
      "https://commons.wikimedia.org/wiki/File:", photo_details
    )
    commons_page <- read_html_or_null(photo_credit_url)
    if (is.null(commons_page)) {
      next
    }
    wikimedia_text <- html_text(commons_page)
    author <- str_split(wikimedia_text, pattern = "Author")[[1]][2]
    author <- trimws(str_split(author, pattern = "\n")[[1]][2])
    # If we were unable to get any info for the author, skip it
    if (is.na(author) || !nzchar(author)) {
      next
    }

    # Now to begin getting the bat image: the link with class "internal" on
    # the file page is taken to be the image file itself.
    bat_media <- html_nodes(commons_page, ".internal")
    if (length(bat_media) == 0) {
      next
    }
    photo_url <- html_attr(bat_media[[1]], "href")
    if (is.na(photo_url) || !nzchar(photo_url)) {
      next
    }
    if (startsWith(photo_url, "//")) { # Protocol-relative link
      photo_url <- paste0("https:", photo_url)
    }
    # Keep the image's real extension for the temporary file, and skip
    # formats (e.g. SVG, TIFF) that cannot sensibly be saved as a photo.
    photo_ext <- tolower(tools::file_ext(photo_url))
    if (!photo_ext %in% c("jpg", "jpeg", "png", "gif")) {
      next
    }

    #### Now we put all the text together to make the tweet string ####
    # Kill any references that have made it through, as nobody wants [11] in
    # their tweet
    outstring <- gsub("\\s*\\[[^]]*\\]", "", outstring)
    # Occasionally the text begins with half a reference, e.g.
    # '16] Nothing is known about the diet...'
    outstring <- sub("^.*\\]\\s?", "", outstring)
    # Or it ends with one, e.g. '...several insects may also be eaten.['
    outstring <- gsub("[", "", outstring, fixed = TRUE)

    outstring <- paste0(sp_name, ": ", outstring) # Start making the tweet
    n_chars <- nchar(outstring)
    # All urls take up 23 characters, then one for a space after it, then
    # more for the hashtag and the image attribution
    extra_length <- 24 + nchar(hashtag) + nchar(paste0("Image by ", author))

    # Only tweet strings that fit and are long enough to be interesting
    if (n_chars < 240 - extra_length && n_chars > 120) {
      page <- tryCatch(
        page_info("en", "wikipedia", page = sp_name, as_wikitext = TRUE),
        error = function(e) NULL
      )
      url <- page$query$pages[[1]]$fullurl
      if (is.null(url)) {
        next
      }
      outstring <- paste(outstring, "Image by", author, url, hashtag, sep = " ")
      print(outstring)
      break
    }
  }

  # The image is downloaded only now, so that we don't regularly download
  # images for potential tweets that don't pass our QC.
  image_file <- paste0("temp.", photo_ext)
  download.file(photo_url, image_file, mode = "wb")

  ##### twitter things ####
  ## Vignette of instructions here: http://rtweet.info/articles/auth.html
  # Now we just tweet the output
  post_tweet(
    status = outstring, token = twitter_token,
    in_reply_to_status_id = NULL, media = file.path(".", image_file)
  )
  file.remove(image_file)
  print(Sys.time())

  if (!isTRUE(wait_in_r)) {
    break
  }
  Sys.sleep(wait_duration) # The number of seconds to sleep for
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment