# Last active
# November 29, 2017 13:27
# -
# -
# Save josefslerka/fb8e0300f47bd5241a614c7a70352d21 to your computer and use it in GitHub Desktop.
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Tutorial this script follows:
# https://www.analyticsvidhya.com/blog/2017/03/beginners-guide-on-web-scraping-in-r-using-rvest-with-hands-on-knowledge/
library(rvest)

# IMDb search listing; the query string asks for 300 results, release dates
# within 2016, and title type "feature".
url <- "http://www.imdb.com/search/title?count=300&release_date=2016,2016&title_type=feature"

# Download and parse the page once; every selector below queries this
# parsed document. read_html() raises an error if the request fails.
webpage <- read_html(url)
# Build one data-frame row per movie listed on `webpage`.
#
# Each field is looked up *inside* its own list item with html_node(), which
# yields exactly one result per item (a missing node, read back as NA by
# html_text(), when the item has no such element). Selecting every field
# page-wide with html_nodes() instead produces vectors of different lengths
# as soon as one movie lacks a runtime, rating or description, and
# data.frame() then fails or the columns end up misaligned.
movie_items <- html_nodes(webpage, ".lister-item-content")
if (length(movie_items) == 0) {
  stop(
    "No '.lister-item-content' nodes found; the page layout may have changed.",
    call. = FALSE
  )
}

# Rank: text such as "1."; drop thousands separators before converting.
rank_data_html <- html_node(movie_items, ".text-primary")
rank_data <- html_text(rank_data_html)
rank_data <- as.numeric(gsub(",", "", rank_data))

# Title: first link inside the item header.
title_data_html <- html_node(movie_items, ".lister-item-header a")
title_data <- html_text(title_data_html)

# Description: the muted paragraph directly following the ratings bar.
description_data_html <- html_node(movie_items, ".ratings-bar+ .text-muted")
description_data <- html_text(description_data_html)
description_data <- gsub("\n", "", description_data)

# Runtime: strip the " min" suffix and convert to a number.
runtime_data_html <- html_node(movie_items, ".text-muted .runtime")
runtime_data <- html_text(runtime_data_html)
runtime_data <- gsub(" min", "", runtime_data)
runtime_data <- as.numeric(runtime_data)

# Genre: remove newlines and spaces, keep only the first listed genre,
# and store it as a factor.
genre_data_html <- html_node(movie_items, ".genre")
genre_data <- html_text(genre_data_html)
genre_data <- gsub("\n", "", genre_data)
genre_data <- gsub(" ", "", genre_data)
genre_data <- gsub(",.*", "", genre_data)
genre_data <- as.factor(genre_data)

# IMDb rating as a number.
rating_data_html <- html_node(movie_items, ".ratings-imdb-rating strong")
rating_data <- html_text(rating_data_html)
rating_data <- as.numeric(rating_data)

# All vectors have length(movie_items) elements, so the columns line up.
# stringsAsFactors is set explicitly so Title and Description stay character
# regardless of the R version's default; Genre is already a factor.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  stringsAsFactors = FALSE
)
# Exploratory look at the scraped movies in `movies_df`.

# Five-number summary (plus mean) of the runtimes.
summary(movies_df$Runtime)

# Runtime against IMDb rating.
plot(movies_df$Runtime, movies_df$Rating)

# Number of movies per (first-listed) genre: printed, then shown as a pie.
freq <- table(movies_df$Genre)
freq
pie(freq)

# Correlation between runtime and rating. Rows where either value is NA are
# left out; without `use`, a single NA makes the whole result NA.
cor(movies_df$Runtime, movies_df$Rating, use = "pairwise.complete.obs")

# Rating distributions for three genres, side by side.
comedy_df <- subset(movies_df, Genre == "Comedy")
drama_df <- subset(movies_df, Genre == "Drama")
action_df <- subset(movies_df, Genre == "Action")
boxplot(
  comedy_df$Rating, action_df$Rating, drama_df$Rating,
  names = c("Comedy", "Action", "Drama")
)
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment