Created
September 6, 2017 17:19
-
-
Save marceloszilagyi/19b27b538799fb1020731764aae2323c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this script validates the prediction using a test dataset | |
# Library Load ------------------------------------------------------------ | |
listpackages = c('tm', 'tidyverse','ggplot2','scales','DT', 'tidyr', 'igraph','magrittr','gridExtra','readr','stringi','stringr','textclean','reshape2', 'tidytext','data.table') | |
loaded = suppressMessages(suppressWarnings( | |
sapply(listpackages, function (x) library(x,character.only = T)) | |
)) | |
rm(list = c('listpackages','loaded')) | |
library("tidyverse") | |
library("stringr") | |
library("textclean") | |
library("lexicon") | |
library("magrittr") | |
library(shiny) | |
library(stringi) | |
# Load support files ----------------------------------------------------------- | |
# get the badwords | |
badwords_file = list.files(recursive = TRUE, pattern = glob2rx('*badwords.txt')) | |
if(length(badwords_file)==0) { | |
download.file('https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en',"badwords.txt")} | |
if(!("badwords" %in% ls())) { | |
badwords = as_tibble(read.csv(badwords_file[1])); colnames(badwords) <- "word"} | |
# get the dictionary | |
dictionary_file = list.files(recursive = TRUE, pattern = glob2rx('*dictionary.txt')) | |
if(length(dictionary_file)==0){ | |
download.file('https://raw.githubusercontent.com/dwyl/english-words/master/words3.txt',destfile = "dictionary.txt") | |
} | |
suppressMessages( | |
suppressWarnings( | |
if(!("dictionary" %in% ls())) { | |
dictionary = read_csv(dictionary_file[1],col_names = "word") | |
dictionary = dictionary %>% mutate(dictionary="dictionary") | |
})) | |
# add "BADWORD" as a word in the dictionary | |
dictionary = dictionary %>% add_row(word = c("badword","BADWORD"), dictionary = c("dictionary","dictionary")) | |
# get a stoplist | |
if(!("otherstoplist" %in% ls())){ | |
otherstoplist = as_tibble(tm::stopwords("en")) %>% rename(word=value) | |
otherstoplist = otherstoplist %>% mutate(n_ss= dense_rank(str_detect(otherstoplist$word,"'"))) %>% arrange(desc(n_ss)) | |
} | |
# get a contraction list and expand it | |
contractions = lexicon::key_contractions | |
addedcontractons = bind_cols(contraction = str_replace_all(contractions$contraction,"'",""), | |
expanded = contractions$expanded) | |
contractions = bind_rows(contractions,addedcontractons) | |
contractions = contractions %>% filter(contraction != "its") | |
contrac_repl = contractions$expanded | |
names(contrac_repl) <- paste0("\\b",contractions$contraction,"\\b") # to force the edge of words | |
# Function to expand contractions and convert numbers to "num" ----- | |
expand_contraction = function (text) { | |
text %<>% str_replace_all(pattern = "[`''']",replacement = "'" ) | |
text %<>% tolower() | |
text %<>% str_replace_all(pattern = "\\b\\w*\\d,*\\.*\\w*,*\\.*\\b", replacement = "num") # the real regex is \b\w*\d,*\.*\w*,*\.*\b | |
text %<>% str_replace_all(pattern = "\\b(num)\\W*\\S*(num)\\b", replacement = "num") | |
{for (i in (seq_along(contrac_repl))) | |
text %<>% str_replace_all(pattern = tolower(names(contrac_repl))[i] ,replacement = tolower(contrac_repl[i])) | |
} | |
# after the contractions, replace the any 's after the word | |
text %<>% str_replace_all("'s","") | |
return(text) | |
} | |
# READ THE TEST DATASET - remove badwords ------------------------------------------------------ | |
wordbyword_dev = fread("wordbyword_test.csv") | |
# replace badwords with the mark "BADWORD" | |
wordbyword_dev = wordbyword_dev %>% mutate(word=replace(word, word %in% badwords$word, "badword")) | |
# here I manually add new words to dictionary based on the results above | |
newwords = c("badword","num","lol","blog","obama","facebook","omg","website","nfl","nba","ceo","google", "u.s","a.m","u.k", "p.m") | |
newtibble = as.tibble(cbind(word = newwords,dictionary = "newwords")) | |
dictionary = rbind(dictionary,newtibble) | |
'%ni%' <- Negate('%in%') | |
# calculate the unigrams for words that are not dictionary words | |
notreallywords = wordbyword_dev %>% filter (word %ni% dictionary$word) | |
unigramsnonwords = notreallywords %>% count(word) %>% arrange(desc(n)) | |
unigramsnonwords | |
# replace non dictionary with the mark "UNKWORD". | |
# https://stackoverflow.com/questions/38351820/negation-of-in-in-r | |
`%nin%` = Negate(`%in%`) | |
wordbyword_dev = wordbyword_dev %>% mutate(original_word =word, word= replace(word, word %nin% dictionary$word, "UNKWORD")) | |
# Add words to the tidy table for future calculation-------------- | |
# add bigrams to the table | |
wordbyword_dev = wordbyword_dev %>% mutate (next_word = if_else(lead(linenumber)==linenumber,lead(word),"")) | |
# add trigrams to the table | |
wordbyword_dev = wordbyword_dev %>% mutate (sec_next_word = if_else(lead(linenumber, n=2)==linenumber,lead(word, n=2),"")) | |
# add fourgrams to the table | |
wordbyword_dev = wordbyword_dev %>% mutate (third_next_word = if_else(lead(linenumber, n=3)==linenumber,lead(word, n=3),"")) | |
# add 5 grams to the table | |
wordbyword_dev = wordbyword_dev %>% mutate (fourth_next_word = if_else(lead(linenumber, n=4)==linenumber,lead(word, n=4),"")) | |
write_csv(wordbyword_dev,"wordbyword_test_ready_only_dic.csv") | |
# Create unigrams, bigrams...----- | |
if(file.exists("wordbyword_test_ready_only_dic.csv", recursive=TRUE)){ | |
wordbyword_dev = fread(list.files(pattern = "wordbyword_test_ready_only_dic.csv",recursive=TRUE)[1]) | |
} | |
# this function excludes cases that are 'end of message' | |
# reference here - https://stackoverflow.com/questions/37363583/dplyr-filter-if-any-variable-is-equal-to-a-value | |
exclude <- function(a,test_val,na.rm=T) | |
{out <- a %>% filter(!rowSums(a==test_val,na.rm=na.rm)) | |
return(out) | |
} | |
unigrams = wordbyword_dev %>% count(word) %>% arrange(desc(n)) %>% mutate(uniprop = n/sum(n)) | |
write_csv(unigrams,"unigrams_test.csv") | |
rm(unigrams) | |
bigrams = wordbyword_dev %>% count(word,next_word) %>% arrange(desc(n)) %>% group_by(word) %>% mutate(biprop = n/sum(n)) | |
write_csv(bigrams,"bigrams_test.csv") | |
rm(bigrams) | |
trigrams = wordbyword_dev %>% count(word,next_word,sec_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word) %>% mutate(triprop = n/sum(n)) | |
write_csv(trigrams,"trigrams_test.csv") | |
rm(trigrams) | |
fourgrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word) %>% mutate(fourprop = n/sum(n)) | |
write_csv(fourgrams,"fourgrams_test.csv") | |
rm(fourgrams) | |
fivegrams = wordbyword_dev %>% count(word,next_word,sec_next_word,third_next_word,fourth_next_word) %>% arrange(desc(n)) %>% group_by(word,next_word,sec_next_word,third_next_word) %>% mutate(fiveprop = n/sum(n)) | |
write_csv(fivegrams,"fivegrams_test.csv") | |
rm(fivegrams) | |
# Retrive the ngrams | |
fivegrams_test <- fread("fivegrams_test.csv") | |
fourgrams_test <- fread("fourgrams_test.csv") | |
trigrams_test <- fread("trigrams_test.csv") | |
bigrams_test <- fread("bigrams_test.csv") | |
unigrams_test <- fread("unigrams_test.csv") | |
# remove the "" (end of prhase) | |
bigrams_test <- bigrams_test %>% filter(next_word!="") | |
trigrams_test <- trigrams_test %>% filter(next_word != "", sec_next_word != "") | |
fourgrams_test <- fourgrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "") | |
fivegrams_test <- fivegrams_test %>% filter(next_word != "", sec_next_word != "", third_next_word != "", fourth_next_word != "") | |
# count the loss due to pruning | |
total_five <- sum(fivegrams_test$n) | |
total_four <- sum(fourgrams_test$n) | |
total_trig <- sum(trigrams_test$n) | |
total_twog <- sum(bigrams_test$n) | |
total_unig <- sum(unigrams_test$n) | |
# prune | |
fivegrams_test = fivegrams_test %>% filter(n>3) | |
fourgrams_test = fourgrams_test %>% filter(n>2) | |
trigrams_test = trigrams_test %>% filter(n>2) | |
bigrams_test = bigrams_test %>% filter(n>1) | |
# loss | |
lossfive_test = sum(fivegrams_test$n)/total_five | |
lossfour_test = sum(fourgrams_test$n)/total_four | |
losstrig_test = sum(trigrams_test$n)/total_trig | |
losstwog_test = sum(bigrams_test$n)/total_twog | |
# stupid backoff function ------ | |
phrase = "every inch of you is perfect from the bottom to the" | |
wordpredict <- function (phrase) { | |
# pass by the dictionary and contraction | |
phrase <- expand_contraction(phrase) | |
#splittext <- str_split(phrase," ",simplify = T) | |
splittext <- stri_split_boundaries(phrase, type="word",tokens_only = T,skip_word_none=TRUE, simplify = T) | |
splittext <- ifelse(splittext %in% dictionary$word, splittext, "UNKWORD") | |
splittext <- ifelse(splittext %in% badwords$word, "badword", splittext) | |
lengthtext = length(splittext) | |
backoff_index = ifelse(lengthtext>4,4,lengthtext) | |
# pick at maximum four words | |
if (lengthtext >1) {word_search_last = splittext[lengthtext]} | |
if (lengthtext >2) {word_search_last_minus_one = splittext[lengthtext-1]} | |
if (lengthtext >3) {word_search_last_minus_two = splittext[lengthtext-2]} | |
if (lengthtext >4) {word_search_last_minus_three = splittext[lengthtext-3]} | |
if (lengthtext >4) {fivechance = fivegrams %>% filter(third_next_word==word_search_last, sec_next_word == word_search_last_minus_one, next_word == word_search_last_minus_two, word == word_search_last_minus_three) %>% top_n(5,wt = fiveprop) %>% mutate(chance = fiveprop*0.4^(backoff_index-4), origin="fivegrams") %>% ungroup() %>% select(selection = fourth_next_word, chance, origin)} | |
if (lengthtext >3) {fourchance = fourgrams %>% filter(sec_next_word==word_search_last, next_word==word_search_last_minus_one, word==word_search_last_minus_two) %>% top_n(5, wt=fourprop) %>% mutate(chance = fourprop*0.4^(backoff_index-3), origin = "fourgrams") %>% ungroup() %>% select(selection = third_next_word, chance, origin)} | |
if (lengthtext >2) {threechance = trigrams %>% filter(next_word==word_search_last,word==word_search_last_minus_one) %>% top_n(5, wt=triprop) %>% mutate(chance = triprop*0.4^(backoff_index-2), origin = "trigrams") %>% ungroup() %>% select(selection = sec_next_word, chance, origin)} | |
if (lengthtext >1) {bichance = bigrams %>% filter(word==word_search_last) %>% top_n(5, wt=biprop) %>% mutate(chance = biprop*0.4^(backoff_index-1), origin = "bigrams") %>% ungroup() %>% select(selection = next_word, chance, origin)} | |
if (lengthtext == 1) {unichance = unigrams %>% top_n(5, uniprop) %>% mutate(chance = uniprop*0.4^(backoff_index-1), origin = "unigram") %>% ungroup() %>% select(selection = word, chance, origin)} | |
chance_final <- rbind(if(exists("fivechance")){fivechance}, | |
if(exists("fourchance")){fourchance}, | |
if(exists("threechance")){threechance}, | |
if(exists("bichance")){bichance}, | |
if(exists("unichance")){unichance}) | |
if(exists("fivechance")){rm(fivechance)} | |
if(exists("fourchance")){rm(fourchance)} | |
if(exists("threechance")){rm(threechance)} | |
if(exists("bichance")){rm(bichance)} | |
if(exists("unichance")){rm(unichance)} | |
ifelse(exists("chance_final"), | |
chance_final2 <- chance_final %>% arrange(selection,desc(chance)) %>% distinct(selection,.keep_all = TRUE), | |
chance_final2 <- unigrams %>% top_n(5, wt=uniprop)) | |
return(list(splittext,chance_final2,lengthtext,backoff_index)) | |
} | |
# retrieve the dev grams to run the function---- | |
fivegrams <- fread("lean_five.csv") | |
fourgrams <- fread("lean_four.csv") | |
trigrams <- fread("lean_tri.csv") | |
bigrams <- fread("lean_bi.csv") | |
unigrams <- fread("lean_uni.csv") | |
# run the function for the fivegrams ---- | |
wordpredict("to live and let")[[2]]$selection | |
un# write a function to return T or F and store in the fivegrams | |
checkmatch <- function(aword, bword, cword, dword, eword){ | |
results = paste0(aword, bword, cword, dword, collapse = " ") %>% wordpredict() %>% extract2(2) %>% select(selection) | |
checkmatch <- is.element(eword[1],results$selection) | |
return(checkmatch) | |
} | |
fivegrams_test = fivegrams_test %>% mutate(result = checkmatch(word,next_word,sec_next_word,third_next_word,fourth_next_word)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment