library(rvest)
library(dplyr)
library(stringr)
library(tidyr)         # for unnest()
library(tidytext)      # for the stop_words dataset
library(ggplot2)
library(ggthemes)
library(tm)
library(wordcloud)     # for wordcloud()
library(RColorBrewer)  # for brewer.pal()
out <- NULL
for (i in 1:6) {
  # Page 1 lives at the archive root; later pages are at /page/<i>
  page_url <- if (i == 1) {
    "http://www.analyticshour.io/all-podcast-episodes/"
  } else {
    paste0("http://www.analyticshour.io/all-podcast-episodes/page/", i)
  }
  x <- read_html(page_url)
  for (j in 1:6) {
    # Title node of the j-th episode listed on this archive page
    htmlNode <- x %>%
      html_nodes(paste0("body > div.super-container.light-icons > div.main-content.page.archive-page > div > div > div > div > article:nth-child(", j, ") > div > footer > ul > li.title.not-truncate ")) %>%
      html_children()
    # Episode number, parsed from the title text (e.g. "#054: ..." -> 54)
    check <- x %>%
      html_nodes(paste0("body > div.super-container.light-icons > div.main-content.page.archive-page > div > div > div > div > article:nth-child(", j, ") > div > footer > ul > li.title.not-truncate > a")) %>%
      html_text() %>%
      substr(2, 4) %>%
      as.numeric()
    # Guard against pages that list fewer than six articles
    if (length(check) == 0 || is.na(check)) break
    # Episode URL: the text between the first pair of double quotes in the raw node
    url <- substr(htmlNode, gregexpr('"', htmlNode)[[1]][1] + 1, gregexpr('"', htmlNode)[[1]][2] - 1)
    if (check > 53 & check < 79) { # 2017 episodes
      content <- read_html(url)
      # The numeric post id is embedded in the body class (e.g. "postid-1234")
      post_id <- content %>% html_nodes("body") %>% html_attr("class") %>% {gsub("\\D", "", .)}
      test_text <- content %>%
        html_nodes(paste0("#post-", post_id, " > div ")) %>%
        html_text() %>%
        strsplit(" ") %>%
        unlist()
      # Keep only the words after the first "Transcript" marker
      test_text <- test_text[(which(grepl('Transcript', test_text))[1] + 1):length(test_text)]
      out <- rbind(out, data.frame(ep = check, test_text))
    } else if (i > 1) {
      # Later pages only get older, so once we leave the 2017 range, stop.
      # Page 1 may still hold 2017 episodes further down, so keep scanning it.
      break
    }
  }
}
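# Note: the substr()/gregexpr() trick above slices the href out of the raw
# node text. A more direct alternative (a sketch, assuming the same anchor
# selector as in the loop) would be rvest's html_attr():
#   url <- x %>%
#     html_nodes(paste0("body > div.super-container.light-icons > div.main-content.page.archive-page > div > div > div > div > article:nth-child(", j, ") > div > footer > ul > li.title.not-truncate > a")) %>%
#     html_attr("href")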
# Strip punctuation and quote characters, collapse newlines, and drop any
# remaining non-ASCII characters
out$word <- str_replace_all(out$test_text, "‘|“|\\.|\\,|\\?|\\:|\"|\\!|\\`|/", "")
out$word <- str_replace_all(out$word, "\n", " ")
out$word <- iconv(out$word, "latin1", "ASCII", sub = "")
# Keep only ep and word, split into one word per row, and lower-case
out <- out[c(1, 3)] %>% mutate(word = strsplit(as.character(word), " ")) %>% unnest(word)
out$word <- tolower(out$word)
out <- out[complete.cases(out), ]
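# Aside: tidytext is loaded above but its tokenizer is never used. The manual
# punctuation stripping, splitting, and lower-casing could likely be collapsed
# into a single unnest_tokens() call on the pre-split data frame -- a sketch,
# not a drop-in replacement (it would also strip the brackets from cues like
# '[laughter]' that the plots below rely on):
#   out <- out %>% unnest_tokens(word, test_text)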
# Laughs per episode
out %>% filter(word == '[laughter]') %>% group_by(ep) %>% summarise(n = n()) %>% ggplot(aes(x = ep, y = n)) + geom_col(fill = "#EF4A62") + theme_hc() + xlab('Episode') + ylab('Number of laughs') + ggtitle("The funniest episode?")
# Swear words per episode
out %>% filter(grepl("fuck", word)) %>% group_by(ep) %>% summarise(n = n()) %>% ggplot(aes(x = ep, y = n)) + geom_col(fill = "#3FA0D9") + theme_hc() + xlab('Episode') + ylab('Number of #$%^') + ggtitle("Not for children!!")
# Machine learning / AI mentions per episode
out %>% filter(grepl("machine|^ai$|artificial", word)) %>% group_by(ep) %>% summarise(n = n()) %>% ggplot(aes(x = ep, y = n)) + geom_col(fill = "#39308A") + theme_hc() + xlab('Episode') + ylab('Machine learning / AI mentions') + ggtitle("Machines taking over the world")
# "Mobile" mentions per episode
out %>% filter(grepl("mobile", word)) %>% group_by(ep) %>% summarise(n = n()) %>% ggplot(aes(x = ep, y = n)) + geom_col(fill = "#39308A") + theme_hc() + xlab('Episode') + ylab('Mobile mentions') + ggtitle("Was 2017 the year of mobile?")
# Google vs. Adobe mentions per episode
out %>% filter(grepl("^adobe$|^google$", word)) %>% group_by(ep, word) %>% summarise(n = n()) %>% ggplot(aes(x = ep, y = n, fill = word)) + geom_col() + theme_hc() + xlab('Episode') + ylab('Mentions') + ggtitle("Google vs Adobe") + scale_fill_manual(values = c("#2B2047","#FFC519"))
# Drop standard stop words plus transcript-specific noise (speaker initials,
# fillers, sound cues, and host names)
out <- out %>% filter(!word %in% stop_words$word & !word %in% c('mh','jn','sa','tw','mk','cb','im','ar','yeah','youre','[chuckle]','[laughter]','gonna','dont','ive','[music]','ss','isnt','tim','moe','helbling','jd','youve','bit','lot','whos','ago','hes','shes','doesnt','michael','wilson','theyre','wanna','mg'))
# Most frequent word in each episode
out %>% group_by(ep, word) %>% summarise(n = n()) %>% ungroup() %>% group_by(ep) %>% filter(n == max(n))
# Overall word frequencies for the word cloud
forCloud <- out %>% group_by(word) %>% summarise(n = n())
wordcloud(forCloud$word, forCloud$n, min.freq = 60, colors = brewer.pal(6, "Dark2"))
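# wordcloud() places words with some randomness, so the layout differs between
# runs; to reproduce it, seed the RNG first (seed value arbitrary):
#   set.seed(2017)
#   wordcloud(forCloud$word, forCloud$n, min.freq = 60, colors = brewer.pal(6, "Dark2"))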