marceloszilagyi · September 6, 2017 17:19
diff --git a/gistfile1.txt b/gistfile1.txt
 # this script validates the prediction using a test dataset
 # Library Load ------------------------------------------------------------

 listpackages = c('tm', 'tidyverse','ggplot2','scales','DT', 'tidyr', 'igraph','magrittr','gridExtra','readr','stringi','stringr','textclean','reshape2', 'tidytext','data.table')
 loaded = suppressMessages(suppressWarnings(
  sapply(listpackages, function (x) library(x,character.only = T))
 ))

 rm(list = c('listpackages','loaded'))

 library("tidyverse")
 library("stringr") 
 library("textclean") 
 library("lexicon")
 library("magrittr")
 library(shiny)
 library(stringi)


 # Load support files -----------------------------------------------------------

 # get the badwords
 badwords_file = list.files(recursive = TRUE, pattern = glob2rx('*badwords.txt'))
 if(length(badwords_file)==0) {
  download.file('https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en',"badwords.txt")}
 if(!("badwords" %in% ls())) {
  badwords = as_tibble(read.csv(badwords_file[1])); colnames(badwords) <- "word"}

 # get the dictionary
 dictionary_file = list.files(recursive = TRUE, pattern = glob2rx('*dictionary.txt'))
 if(length(dictionary_file)==0){
  download.file('https://raw.githubusercontent.com/dwyl/english-words/master/words3.txt',destfile = "dictionary.txt")
 }

 suppressMessages(
  suppressWarnings(
    if(!("dictionary" %in% ls())) {
      dictionary = read_csv(dictionary_file[1],col_names = "word")
      dictionary  = dictionary %>% mutate(dictionary="dictionary")
    }))

 # add "BADWORD" as a word in the dictionary
 dictionary = dictionary %>% add_row(word = c("badword","BADWORD"), dictionary = c("dictionary","dictionary"))

 # get a stoplist
 if(!("otherstoplist" %in% ls())){
  otherstoplist = as_tibble(tm::stopwords("en")) %>% rename(word=value)
  otherstoplist = otherstoplist %>% mutate(n_ss= dense_rank(str_detect(otherstoplist$word,"'"))) %>% arrange(desc(n_ss))
 }

 # get a contraction list and expand it  
 contractions = lexicon::key_contractions
 addedcontractons = bind_cols(contraction = str_replace_all(contractions$contraction,"'",""), 
                             expanded = contractions$expanded)
 contractions = bind_rows(contractions,addedcontractons)
 contractions = contractions %>% filter(contraction != "its")
 contrac_repl = contractions$expanded
 names(contrac_repl) <- paste0("\\b",contractions$contraction,"\\b") # to force the edge of words

 # Function to expand contractions and convert numbers to "num" ----- 
 expand_contraction = function (text) { 
  text %<>% str_replace_all(pattern = "[`''']",replacement = "'" ) 
  text %<>% tolower()
  text %<>% str_replace_all(pattern = "\\b\\w*\\d,*\\.*\\w*,*\\.*\\b", replacement = "num") # the real regex is \b\w*\d,*\.*\w*,*\.*\b
  text %<>% str_replace_all(pattern = "\\b(num)\\W*\\S*(num)\\b", replacement = "num")
  {for (i in (seq_along(contrac_repl)))
    text %<>% str_replace_all(pattern = tolower(names(contrac_repl))[i] ,replacement = tolower(contrac_repl[i]))
  }
  # after the contractions, replace the any 's after the word 
  text %<>% str_replace_all("'s","")
  return(text)
 }


 # READ THE TEST DATASET  - remove badwords ------------------------------------------------------

 wordbyword_dev = fread("wordbyword_test.csv")

 # replace badwords with the mark "BADWORD" 
 wordbyword_dev = wordbyword_dev %>% mutate(word=replace(word, word %in% badwords$word, "badword"))

 # here I manually add  new words to dictionary based on the results above
 newwords = c("badword","num","lol","blog","obama","facebook","omg","website","nfl","nba","ceo","google", "u.s","a.m","u.k", "p.m")
 newtibble = as.tibble(cbind(word = newwords,dictionary = "newwords"))
 dictionary = rbind(dictionary,newtibble)

 '%ni%' <- Negate('%in%')
 # calculate the unigrams for words that are not dictionary words
 notreallywords = wordbyword_dev %>% filter (word %ni% dictionary$word)
 unigramsnonwords =  notreallywords %>% count(word) %>% arrange(desc(n))
 unigramsnonwords

 # replace non dictionary with the mark "UNKWORD".
 # https://stackoverflow.com/questions/38351820/negation-of-in-in-r
 `%nin%` = Negate(`%in%`)
 wordbyword_dev = wordbyword_dev %>% mutate(original_word =word, word= replace(word, word %nin% dictionary$word, "UNKWORD"))

 # Add words to the tidy table for future calculation--------------

 # add bigrams to the table
 wordbyword_dev = wordbyword_dev %>% mutate (next_word = if_else(lead(linenumber)==linenumber,lead(word),""))

 # add trigrams to the table 
 wordbyword_dev = wordbyword_dev %>% mutate (sec_next_word = if_else(lead(linenumber, n=2)==linenumber,lead(word, n=2),""))

 # add fourgrams to the table 
 wordbyword_dev = wordbyword_dev %>% mutate (third_next_word = if_else(lead(linenumber, n=3)==linenumber,lead(word, n=3),""))

 # add 5 grams to the table 
 wordbyword_dev = wordbyword_dev %>% mutate (fourth_next_word = if_else(lead(linenumber, n=4)==linenumber,lead(word, n=4),""))

 write_csv(wordbyword_dev,"wordbyword_test_ready_only_dic.csv")

 # Create unigrams, bigrams...-----

 if(file.exists("wordbyword_test_ready_only_dic.csv", recursive=TRUE)){
  wordbyword_dev = fread(list.files(pattern = "wordbyword_test_ready_only_dic.csv",recursive=TRUE)[1])
 }

 # this function excludes cases that are 'end of message'
 # reference here - https://stackoverflow.com/questions/37363583/dplyr-filter-if-any-variable-is-equal-to-a-value

 exclude <- function(a,test_val,na.rm=T)
 {out <- a %>% filter(!rowSums(a==test_val,na.rm=na.rm))
 return(out)
 }

 unigrams =  wordbyword_dev %>% count(word) %>% arrange(desc(n)) %>% mutate(uniprop = n/sum(n))
 write_csv(unigrams,"unigrams_test.csv")
 rm(unigrams)

 bigrams =   wordbyword_dev %>% count(word,next_word) %>% arrange(desc(n)) %>% group_by(word) %>% mutate(biprop = n/sum(n))
 write_csv(bigrams,"bigrams_test.csv")
 rm(bigrams)

 trigrams =  wordbyword_dev %>% count(word,next_word,sec_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word) %>% mutate(triprop = n/sum(n))
 write_csv(trigrams,"trigrams_test.csv")
 rm(trigrams)

 fourgrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word) %>% mutate(fourprop = n/sum(n))
 write_csv(fourgrams,"fourgrams_test.csv") 
 rm(fourgrams)

 fivegrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word,fourth_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word,third_next_word) %>% mutate(fiveprop = n/sum(n))
 write_csv(fivegrams,"fivegrams_test.csv") 
 rm(fivegrams)

 # Retrive the ngrams
 fivegrams_test <- fread("fivegrams_test.csv")
 fourgrams_test <- fread("fourgrams_test.csv")
 trigrams_test <- fread("trigrams_test.csv")
 bigrams_test <- fread("bigrams_test.csv")
 unigrams_test <- fread("unigrams_test.csv")

 # remove the "" (end of prhase)
 bigrams_test <- bigrams_test %>%  filter(next_word!="")
 trigrams_test <- trigrams_test %>%  filter(next_word != "", sec_next_word != "")
 fourgrams_test <- fourgrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "")
 fivegrams_test <- fivegrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "", fourth_next_word != "")

 # count the loss due to pruning

 total_five <- sum(fivegrams_test$n)
 total_four <- sum(fourgrams_test$n)
 total_trig <- sum(trigrams_test$n)
 total_twog <- sum(bigrams_test$n)
 total_unig <- sum(unigrams_test$n)

 # prune
 fivegrams_test = fivegrams_test %>% filter(n>3)
 fourgrams_test = fourgrams_test %>% filter(n>2)
 trigrams_test  = trigrams_test %>% filter(n>2)
 bigrams_test   = bigrams_test %>% filter(n>1)

 # loss 
 lossfive_test = sum(fivegrams_test$n)/total_five
 lossfour_test = sum(fourgrams_test$n)/total_four
 losstrig_test = sum(trigrams_test$n)/total_trig
 losstwog_test = sum(bigrams_test$n)/total_twog


 # stupid backoff function ------
 phrase = "every inch of you is perfect from the bottom to the"
 wordpredict <- function (phrase) {
  # pass by the dictionary and contraction
  phrase <- expand_contraction(phrase)
  #splittext <- str_split(phrase," ",simplify = T)
  splittext <- stri_split_boundaries(phrase, type="word",tokens_only = T,skip_word_none=TRUE, simplify = T)
  splittext <- ifelse(splittext %in% dictionary$word, splittext, "UNKWORD")
  splittext <- ifelse(splittext %in% badwords$word, "badword", splittext)
  lengthtext = length(splittext)
  backoff_index = ifelse(lengthtext>4,4,lengthtext)
  # pick at maximum four words
  if (lengthtext >1) {word_search_last = splittext[lengthtext]}
  if (lengthtext >2) {word_search_last_minus_one = splittext[lengthtext-1]} 
  if (lengthtext >3) {word_search_last_minus_two = splittext[lengthtext-2]} 
  if (lengthtext >4) {word_search_last_minus_three = splittext[lengthtext-3]}                    
  
  if (lengthtext >4) {fivechance = fivegrams %>% filter(third_next_word==word_search_last, sec_next_word == word_search_last_minus_one, next_word == word_search_last_minus_two, word == word_search_last_minus_three) %>% top_n(5,wt = fiveprop) %>% mutate(chance = fiveprop*0.4^(backoff_index-4), origin="fivegrams") %>% ungroup() %>% select(selection = fourth_next_word, chance, origin)}
  
  if (lengthtext >3) {fourchance = fourgrams %>% filter(sec_next_word==word_search_last, next_word==word_search_last_minus_one, word==word_search_last_minus_two) %>% top_n(5, wt=fourprop) %>% mutate(chance = fourprop*0.4^(backoff_index-3), origin = "fourgrams") %>% ungroup() %>% select(selection = third_next_word, chance, origin)}
  
  if (lengthtext >2) {threechance = trigrams %>% filter(next_word==word_search_last,word==word_search_last_minus_one) %>% top_n(5, wt=triprop) %>% mutate(chance = triprop*0.4^(backoff_index-2), origin = "trigrams") %>% ungroup() %>% select(selection = sec_next_word, chance, origin)}
  
  if (lengthtext >1) {bichance = bigrams %>% filter(word==word_search_last) %>% top_n(5, wt=biprop) %>% mutate(chance = biprop*0.4^(backoff_index-1), origin = "bigrams") %>% ungroup() %>% select(selection = next_word, chance, origin)}
  
  if (lengthtext == 1) {unichance = unigrams %>% top_n(5, uniprop) %>% mutate(chance = uniprop*0.4^(backoff_index-1), origin = "unigram") %>% ungroup() %>% select(selection = word, chance, origin)}
  
  
  chance_final <- rbind(if(exists("fivechance")){fivechance},
                        if(exists("fourchance")){fourchance},
                        if(exists("threechance")){threechance},
                        if(exists("bichance")){bichance},
                        if(exists("unichance")){unichance})
  
  if(exists("fivechance")){rm(fivechance)} 
  if(exists("fourchance")){rm(fourchance)}
  if(exists("threechance")){rm(threechance)}
  if(exists("bichance")){rm(bichance)}
  if(exists("unichance")){rm(unichance)}
  
  ifelse(exists("chance_final"), 
         chance_final2 <- chance_final %>% arrange(selection,desc(chance)) %>% distinct(selection,.keep_all = TRUE), 
         chance_final2 <- unigrams %>% top_n(5, wt=uniprop))
  return(list(splittext,chance_final2,lengthtext,backoff_index))
 }

 # retrieve the dev grams to run the function----
 fivegrams <- fread("lean_five.csv")
 fourgrams <- fread("lean_four.csv")
 trigrams <- fread("lean_tri.csv")
 bigrams <- fread("lean_bi.csv")
 unigrams <- fread("lean_uni.csv")

 # run the function for the fivegrams ----

 wordpredict("to live and let")[[2]]$selection

 un# write a function to return T or F and store in the fivegrams

 checkmatch <- function(aword, bword, cword, dword, eword){
  
  results = paste0(aword, bword, cword, dword, collapse = " ") %>% wordpredict() %>% extract2(2) %>% select(selection) 
  checkmatch <- is.element(eword[1],results$selection)
  return(checkmatch)
 }


 fivegrams_test = fivegrams_test %>% mutate(result = checkmatch(word,next_word,sec_next_word,third_next_word,fourth_next_word))
	# this script validates the prediction using a test dataset
	# Library Load ------------------------------------------------------------

	listpackages = c('tm', 'tidyverse','ggplot2','scales','DT', 'tidyr', 'igraph','magrittr','gridExtra','readr','stringi','stringr','textclean','reshape2', 'tidytext','data.table')
	loaded = suppressMessages(suppressWarnings(
	sapply(listpackages, function (x) library(x,character.only = T))
	))

	rm(list = c('listpackages','loaded'))

	library("tidyverse")
	library("stringr")
	library("textclean")
	library("lexicon")
	library("magrittr")
	library(shiny)
	library(stringi)


	# Load support files -----------------------------------------------------------

	# get the badwords
	badwords_file = list.files(recursive = TRUE, pattern = glob2rx('*badwords.txt'))
	if(length(badwords_file)==0) {
	download.file('https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en',"badwords.txt")}
	if(!("badwords" %in% ls())) {
	badwords = as_tibble(read.csv(badwords_file[1])); colnames(badwords) <- "word"}

	# get the dictionary
	dictionary_file = list.files(recursive = TRUE, pattern = glob2rx('*dictionary.txt'))
	if(length(dictionary_file)==0){
	download.file('https://raw.githubusercontent.com/dwyl/english-words/master/words3.txt',destfile = "dictionary.txt")
	}

	suppressMessages(
	suppressWarnings(
	if(!("dictionary" %in% ls())) {
	dictionary = read_csv(dictionary_file[1],col_names = "word")
	dictionary = dictionary %>% mutate(dictionary="dictionary")
	}))

	# add "BADWORD" as a word in the dictionary
	dictionary = dictionary %>% add_row(word = c("badword","BADWORD"), dictionary = c("dictionary","dictionary"))

	# get a stoplist
	if(!("otherstoplist" %in% ls())){
	otherstoplist = as_tibble(tm::stopwords("en")) %>% rename(word=value)
	otherstoplist = otherstoplist %>% mutate(n_ss= dense_rank(str_detect(otherstoplist$word,"'"))) %>% arrange(desc(n_ss))
	}

	# get a contraction list and expand it
	contractions = lexicon::key_contractions
	addedcontractons = bind_cols(contraction = str_replace_all(contractions$contraction,"'",""),
	expanded = contractions$expanded)
	contractions = bind_rows(contractions,addedcontractons)
	contractions = contractions %>% filter(contraction != "its")
	contrac_repl = contractions$expanded
	names(contrac_repl) <- paste0("\\b",contractions$contraction,"\\b") # to force the edge of words

	# Function to expand contractions and convert numbers to "num" -----
	expand_contraction = function (text) {
	text %<>% str_replace_all(pattern = "[`''']",replacement = "'" )
	text %<>% tolower()
	text %<>% str_replace_all(pattern = "\\b\\w\\d,\\.\\w,\\.\\b", replacement = "num") # the real regex is \b\w\d,\.\w,\.\b
	text %<>% str_replace_all(pattern = "\\b(num)\\W\\S(num)\\b", replacement = "num")
	{for (i in (seq_along(contrac_repl)))
	text %<>% str_replace_all(pattern = tolower(names(contrac_repl))[i] ,replacement = tolower(contrac_repl[i]))
	}
	# after the contractions, replace the any 's after the word
	text %<>% str_replace_all("'s","")
	return(text)
	}


	# READ THE TEST DATASET - remove badwords ------------------------------------------------------

	wordbyword_dev = fread("wordbyword_test.csv")

	# replace badwords with the mark "BADWORD"
	wordbyword_dev = wordbyword_dev %>% mutate(word=replace(word, word %in% badwords$word, "badword"))

	# here I manually add new words to dictionary based on the results above
	newwords = c("badword","num","lol","blog","obama","facebook","omg","website","nfl","nba","ceo","google", "u.s","a.m","u.k", "p.m")
	newtibble = as.tibble(cbind(word = newwords,dictionary = "newwords"))
	dictionary = rbind(dictionary,newtibble)

	'%ni%' <- Negate('%in%')
	# calculate the unigrams for words that are not dictionary words
	notreallywords = wordbyword_dev %>% filter (word %ni% dictionary$word)
	unigramsnonwords = notreallywords %>% count(word) %>% arrange(desc(n))
	unigramsnonwords

	# replace non dictionary with the mark "UNKWORD".
	# https://stackoverflow.com/questions/38351820/negation-of-in-in-r
	`%nin%` = Negate(`%in%`)
	wordbyword_dev = wordbyword_dev %>% mutate(original_word =word, word= replace(word, word %nin% dictionary$word, "UNKWORD"))

	# Add words to the tidy table for future calculation--------------

	# add bigrams to the table
	wordbyword_dev = wordbyword_dev %>% mutate (next_word = if_else(lead(linenumber)==linenumber,lead(word),""))

	# add trigrams to the table
	wordbyword_dev = wordbyword_dev %>% mutate (sec_next_word = if_else(lead(linenumber, n=2)==linenumber,lead(word, n=2),""))

	# add fourgrams to the table
	wordbyword_dev = wordbyword_dev %>% mutate (third_next_word = if_else(lead(linenumber, n=3)==linenumber,lead(word, n=3),""))

	# add 5 grams to the table
	wordbyword_dev = wordbyword_dev %>% mutate (fourth_next_word = if_else(lead(linenumber, n=4)==linenumber,lead(word, n=4),""))

	write_csv(wordbyword_dev,"wordbyword_test_ready_only_dic.csv")

	# Create unigrams, bigrams...-----

	if(file.exists("wordbyword_test_ready_only_dic.csv", recursive=TRUE)){
	wordbyword_dev = fread(list.files(pattern = "wordbyword_test_ready_only_dic.csv",recursive=TRUE)[1])
	}

	# this function excludes cases that are 'end of message'
	# reference here - https://stackoverflow.com/questions/37363583/dplyr-filter-if-any-variable-is-equal-to-a-value

	exclude <- function(a,test_val,na.rm=T)
	{out <- a %>% filter(!rowSums(a==test_val,na.rm=na.rm))
	return(out)
	}

	unigrams = wordbyword_dev %>% count(word) %>% arrange(desc(n)) %>% mutate(uniprop = n/sum(n))
	write_csv(unigrams,"unigrams_test.csv")
	rm(unigrams)

	bigrams = wordbyword_dev %>% count(word,next_word) %>% arrange(desc(n)) %>% group_by(word) %>% mutate(biprop = n/sum(n))
	write_csv(bigrams,"bigrams_test.csv")
	rm(bigrams)

	trigrams = wordbyword_dev %>% count(word,next_word,sec_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word) %>% mutate(triprop = n/sum(n))
	write_csv(trigrams,"trigrams_test.csv")
	rm(trigrams)

	fourgrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word) %>% mutate(fourprop = n/sum(n))
	write_csv(fourgrams,"fourgrams_test.csv")
	rm(fourgrams)

	fivegrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word,fourth_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word,third_next_word) %>% mutate(fiveprop = n/sum(n))
	write_csv(fivegrams,"fivegrams_test.csv")
	rm(fivegrams)

	# Retrive the ngrams
	fivegrams_test <- fread("fivegrams_test.csv")
	fourgrams_test <- fread("fourgrams_test.csv")
	trigrams_test <- fread("trigrams_test.csv")
	bigrams_test <- fread("bigrams_test.csv")
	unigrams_test <- fread("unigrams_test.csv")

	# remove the "" (end of prhase)
	bigrams_test <- bigrams_test %>% filter(next_word!="")
	trigrams_test <- trigrams_test %>% filter(next_word != "", sec_next_word != "")
	fourgrams_test <- fourgrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "")
	fivegrams_test <- fivegrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "", fourth_next_word != "")

	# count the loss due to pruning

	total_five <- sum(fivegrams_test$n)
	total_four <- sum(fourgrams_test$n)
	total_trig <- sum(trigrams_test$n)
	total_twog <- sum(bigrams_test$n)
	total_unig <- sum(unigrams_test$n)

	# prune
	fivegrams_test = fivegrams_test %>% filter(n>3)
	fourgrams_test = fourgrams_test %>% filter(n>2)
	trigrams_test = trigrams_test %>% filter(n>2)
	bigrams_test = bigrams_test %>% filter(n>1)

	# loss
	lossfive_test = sum(fivegrams_test$n)/total_five
	lossfour_test = sum(fourgrams_test$n)/total_four
	losstrig_test = sum(trigrams_test$n)/total_trig
	losstwog_test = sum(bigrams_test$n)/total_twog


	# stupid backoff function ------
	phrase = "every inch of you is perfect from the bottom to the"
	wordpredict <- function (phrase) {
	# pass by the dictionary and contraction
	phrase <- expand_contraction(phrase)
	#splittext <- str_split(phrase," ",simplify = T)
	splittext <- stri_split_boundaries(phrase, type="word",tokens_only = T,skip_word_none=TRUE, simplify = T)
	splittext <- ifelse(splittext %in% dictionary$word, splittext, "UNKWORD")
	splittext <- ifelse(splittext %in% badwords$word, "badword", splittext)
	lengthtext = length(splittext)
	backoff_index = ifelse(lengthtext>4,4,lengthtext)
	# pick at maximum four words
	if (lengthtext >1) {word_search_last = splittext[lengthtext]}
	if (lengthtext >2) {word_search_last_minus_one = splittext[lengthtext-1]}
	if (lengthtext >3) {word_search_last_minus_two = splittext[lengthtext-2]}
	if (lengthtext >4) {word_search_last_minus_three = splittext[lengthtext-3]}

	if (lengthtext >4) {fivechance = fivegrams %>% filter(third_next_word==word_search_last, sec_next_word == word_search_last_minus_one, next_word == word_search_last_minus_two, word == word_search_last_minus_three) %>% top_n(5,wt = fiveprop) %>% mutate(chance = fiveprop*0.4^(backoff_index-4), origin="fivegrams") %>% ungroup() %>% select(selection = fourth_next_word, chance, origin)}

	if (lengthtext >3) {fourchance = fourgrams %>% filter(sec_next_word==word_search_last, next_word==word_search_last_minus_one, word==word_search_last_minus_two) %>% top_n(5, wt=fourprop) %>% mutate(chance = fourprop*0.4^(backoff_index-3), origin = "fourgrams") %>% ungroup() %>% select(selection = third_next_word, chance, origin)}

	if (lengthtext >2) {threechance = trigrams %>% filter(next_word==word_search_last,word==word_search_last_minus_one) %>% top_n(5, wt=triprop) %>% mutate(chance = triprop*0.4^(backoff_index-2), origin = "trigrams") %>% ungroup() %>% select(selection = sec_next_word, chance, origin)}

	if (lengthtext >1) {bichance = bigrams %>% filter(word==word_search_last) %>% top_n(5, wt=biprop) %>% mutate(chance = biprop*0.4^(backoff_index-1), origin = "bigrams") %>% ungroup() %>% select(selection = next_word, chance, origin)}

	if (lengthtext == 1) {unichance = unigrams %>% top_n(5, uniprop) %>% mutate(chance = uniprop*0.4^(backoff_index-1), origin = "unigram") %>% ungroup() %>% select(selection = word, chance, origin)}


	chance_final <- rbind(if(exists("fivechance")){fivechance},
	if(exists("fourchance")){fourchance},
	if(exists("threechance")){threechance},
	if(exists("bichance")){bichance},
	if(exists("unichance")){unichance})

	if(exists("fivechance")){rm(fivechance)}
	if(exists("fourchance")){rm(fourchance)}
	if(exists("threechance")){rm(threechance)}
	if(exists("bichance")){rm(bichance)}
	if(exists("unichance")){rm(unichance)}

	ifelse(exists("chance_final"),
	chance_final2 <- chance_final %>% arrange(selection,desc(chance)) %>% distinct(selection,.keep_all = TRUE),
	chance_final2 <- unigrams %>% top_n(5, wt=uniprop))
	return(list(splittext,chance_final2,lengthtext,backoff_index))
	}

	# retrieve the dev grams to run the function----
	fivegrams <- fread("lean_five.csv")
	fourgrams <- fread("lean_four.csv")
	trigrams <- fread("lean_tri.csv")
	bigrams <- fread("lean_bi.csv")
	unigrams <- fread("lean_uni.csv")

	# run the function for the fivegrams ----

	wordpredict("to live and let")[[2]]$selection

	un# write a function to return T or F and store in the fivegrams

	checkmatch <- function(aword, bword, cword, dword, eword){

	results = paste0(aword, bword, cword, dword, collapse = " ") %>% wordpredict() %>% extract2(2) %>% select(selection)
	checkmatch <- is.element(eword[1],results$selection)
	return(checkmatch)
	}


	fivegrams_test = fivegrams_test %>% mutate(result = checkmatch(word,next_word,sec_next_word,third_next_word,fourth_next_word))