Skip to content

Instantly share code, notes, and snippets.

Forked from daattali/linkedin.R
Created March 6, 2017 19:11
Show Gist options
  • Save vipl0ve/48f678525c53ee54b9d49f6d01a3a29b to your computer and use it in GitHub Desktop.
Save vipl0ve/48f678525c53ee54b9d49f6d01a3a29b to your computer and use it in GitHub Desktop.
Scraping Twitter and LinkedIn info in R
# Get a person's name, location, summary, # of connections, and skills &
# endorsements from LinkedIn.

# URL of the LinkedIn profile page to scrape. It is empty here; set it to the
# target profile before calling scrape_linkedin().
user_url <- ""

# The profile information isn't available without being logged in, so the web
# scraper needs to log in. The credentials are read from the LINKEDIN_USERNAME
# and LINKEDIN_PASSWORD environment variables so that they do not have to be
# written into this file; the placeholder values are used when a variable is
# unset. They are only used to fill in the login form during the scrape session.
username <- Sys.getenv("LINKEDIN_USERNAME", unset = "yourusername")
password <- Sys.getenv("LINKEDIN_PASSWORD", unset = "yourpassword")

# Usage (takes a couple of seconds and might throw a warning, which can be
# ignored):
# (linkedin_info <- scrape_linkedin(user_url))
# Scrape basic profile information from a LinkedIn profile page.
#
# Opens a session on the login page, fills in and submits the first form found
# there, navigates to `user_url`, and extracts fields from the profile HTML
# with CSS selectors.
#
# Reads the global variables `username` and `password` for the login form.
# Presumably requires rvest (and the `%>%` pipe) to be attached by the caller;
# no library() call is visible in this part of the file.
#
# NOTE(review): html_session(), set_values(), submit_form() and jump_to() were
# renamed in rvest 1.0.0 (session(), html_form_set(), session_submit(),
# session_jump_to()); the older names are kept here to match the original code.
#
# Args:
#   user_url: URL of the LinkedIn profile page to scrape.
#
# Returns:
#   A named list with elements `name`, `location`, `num_connections` and
#   `summary` (the text of the matching nodes) and `skills` (one data frame
#   with columns `name` and `num`, or NULL when no skill nodes are found).
scrape_linkedin <- function(user_url) {
  # NOTE(review): this URL is empty in this copy of the script; it presumably
  # has to point at the LinkedIn login page for the form lookup below to work.
  linkedin_url <- ""

  # Log in by filling in and submitting the first form on the page.
  pgsession <- html_session(linkedin_url)
  pgform <- html_form(pgsession)[[1]]
  filled_form <- set_values(
    pgform,
    session_key = username,
    session_password = password
  )
  submit_form(pgsession, filled_form)

  # Navigate to the profile within the same session and parse its HTML.
  pgsession <- jump_to(pgsession, user_url)
  page_html <- read_html(pgsession)

  name <-
    page_html %>% html_nodes("#name") %>% html_text()

  location <-
    page_html %>% html_nodes("#location .locality") %>% html_text()

  num_connections <-
    page_html %>% html_nodes(".member-connections strong") %>% html_text()

  summary <-
    page_html %>% html_nodes("#summary-item-view") %>% html_text()

  skills_nodes <-
    page_html %>% html_nodes("#profile-skills .skill-pill")

  # One small data frame per skill node. The locals are named so that they do
  # not shadow the profile `name` extracted above.
  skills <-
    lapply(skills_nodes, function(node) {
      skill_num <- node %>% html_nodes(".num-endorsements") %>% html_text()
      skill_name <-
        node %>% html_nodes(".endorse-item-name-text") %>% html_text()
      data.frame(name = skill_name, num = skill_num)
    })

  # Stack the per-skill data frames into one (NULL when there are no skills).
  skills <- do.call(rbind, skills)

  list(
    name = name,
    location = location,
    num_connections = num_connections,
    summary = summary,
    skills = skills
  )
}
# Make a wordcloud of the most common words in a person's tweets.

# Need to create a Twitter App and get credentials.
# NOTE(review): no authentication call is visible in this part of the script;
# presumably the session must be authenticated (e.g. with twitteR's
# setup_twitter_oauth()) before userTimeline() is called -- confirm.

# Username of the Twitter user
name <- "daattali"

# Fetch the user's own tweets (no retweets, no replies).
user <- userTimeline(
  user = name,
  n = 3200,
  includeRts = FALSE,
  excludeReplies = TRUE
)

# Split every tweet into words after dropping everything that is not a letter,
# digit or space. lapply() always returns a list, one character vector per
# tweet, regardless of how many words each tweet has.
tweets <- lapply(user, function(x) {
  strsplit(gsub("[^[:alnum:] ]", "", x$text), " +")[[1]]
})

# Word frequencies, most common first. The words are flattened with unlist()
# rather than pasted together: paste() on a list deparses each element to text
# like c("a", "b"), which glued a stray "c" onto the first word of each tweet
# once the punctuation was removed.
topwords <-
  tweets %>%
  unlist() %>%
  tolower() %>%
  removePunctuation() %>%
  removeWords(stopwords("english")) %>%
  #wordStem %>%
  .[. != ""] %>%
  table() %>%
  sort(decreasing = TRUE)

# Draw the cloud only when there is something to draw.
if (length(topwords) > 0) {
  wordcloud(names(topwords), topwords, min.freq = 3)
} else {
  message("No words left to plot for user '", name, "'.")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment