Created
December 5, 2019 16:25
-
-
Save adrienshen/90a9d8f0f5e2ad1b971ce69505ef6107 to your computer and use it in GitHub Desktop.
Web scraping film scripts using R and PostgreSQL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# source and credits: https://www.mikejohnpage.com/blog/web-scraping-r-s-data-table-and-writing-to-postgresql-and-mysql/ | |
library(tidyverse) | |
library(robotstxt) | |
library(data.table) | |
library(rvest) # scraping library | |
library(stringi) | |
library(tidytext) | |
library(RPostgreSQL) | |
rtxt <- robotstxt("https://www.imsdb.com") | |
rtxt$permissions | |
paths_allowed("https://www.imsdb.com/scripts/10-Things-I-Hate-About-You.html") | |
url <- "https://www.imsdb.com/all%20scripts/" | |
links <- read_html(url) %>% | |
html_nodes("a") %>% | |
html_attr("href") | |
names <- read_html(url) %>% | |
html_nodes("a") %>% | |
html_text() | |
movies_dt <- data.table(links = links, names = names) | |
head(movies_dt) | |
tail(movies_dt) | |
movies_dt <- movies_dt[str_detect(links, "^/Movie")][!1:5] | |
names_replace <- str_replace_all(movies_dt$names, " ", "-") %>% | |
paste0(".html") | |
movies_dt$script = NA | |
movies_dt.sample = movies_dt[20:25] | |
names_replace.sample = names_replace[20:25] | |
for (i in seq_along(names_replace.sample)) { | |
temp_url <- paste0("https://www.imsdb.com/scripts/", names_replace.sample[i]) | |
if (html_session(temp_url)$response$status_code == 404) { | |
movies_dt.sample$script[i] <- NA | |
next | |
} | |
script_text <- temp_url %>% | |
read_html() %>% | |
html_nodes("pre") %>% | |
html_text() | |
if (length(script_text) == 0) { | |
movies_dt.sample$script[i] = NA | |
} else { | |
movies_dt.sample$script[i] <- script_text | |
} | |
} | |
# Inspect the first few lines of movies_dt after the scraping | |
substring(movies_dt.sample$scripts[1], 1, 1000) | |
cat(substring(movies_dt.sample$scripts[1], 1, 2000)) | |
movies_dt.sample[, scripts := str_replace(movies_dt.sample$scripts, "\\\n|\\\t|\\\r", "")] | |
movies_dt.sample <- unique(movies_dt.sample) | |
# movies_dt.first10 <- na.omit(movies_dt.first10) | |
# title_of_medium_articles <- "Transition into Data Engineering from a SWE Series" | |
movies_dt.sample[, script := str_to_lower(script)] | |
movies_dt.sample | |
movies_dt.sample[, script := stri_trans_general(movies_dt.sample$script, "latin-ascii")] | |
movies_dt.sample[, script := str_replace_all(script, "[^[:alnum:]']", " ")] | |
cat(substring(movies_dt.sample$script[1:10], 1, 2000)) | |
# make one row for each work in script | |
tidy_movies <- unnest_tokens(movies_dt.sample, word, script, token = "words") | |
# remove stop words | |
tidy_movies <- tidy_movies %>% anti_join(stop_words) | |
tidy_movies | |
## PostgreSQL write to | |
conn <- dbConnect(drv = dbDriver("PostgreSQL"), | |
dbname = "learnde", | |
host = "localhost", | |
port = 5432, | |
user = "adrienshen", | |
password = "") | |
dbWriteTable(conn=conn, | |
name="movie_scripts_words", | |
value=tidy_movies) | |
dbWriteTable(conn=conn, | |
name="movie_scripts", | |
value=movies_dt.sample) | |
dbDisconnect(conn) | |
# cat(substring(movies_dt.sample$script[1:10], 1, 2000)) | |
conn <- dbConnect(drv = dbDriver("PostgreSQL"), | |
dbname = "learnde", | |
host = "localhost", | |
port = 5432, | |
user = "adrienshen", | |
password = "") | |
# send query | |
query <- dbSendQuery(conn=conn, | |
"select word, count(*) as count | |
from movie_scripts_words | |
where names = '500 Days of Summer' | |
group by word | |
order by count desc;") | |
tidy_movies_st <- dbFetch(query, -1) | |
# to be cont. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment