Created
September 28, 2018 07:06
-
-
Save myrddian/01713b8d846dc52c6a0d267ccde100fb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
model.analysis <-function(test_res, predict_rsp) { | |
col_names <- c("TruePositive","FalsePositive", | |
"FalseNegative","Accuracy","Precision","Recall","F1") | |
ret_val <- data.frame(matrix(ncol = length(col_names), nrow = 0)) | |
colnames(ret_val) <- col_names | |
ret_val[nrow(ret_val) + 1,] = c(tp,fp,fn,accuracy,precision,recall,f1) | |
return(ret_val) | |
} | |
#Penalty Calculator | |
model.penalty_calc <- function(thrs, penal_threshold) | |
{ | |
#return zero | |
if(penal_threshold == FALSE) { | |
return(0) | |
} | |
#Otherwise calculate and return | |
n_p <- (thrs*0.5) - 0.25 | |
#Clip return values to within -0.25 and 0.25 | |
if(n_p > 0.25) | |
{ | |
return(0.25) | |
} | |
if(n_p < -0.25) | |
{ | |
return(-0.25) | |
} | |
#Return the penalty | |
return(n_p) | |
} | |
model.acc_score <- function(analysis_res, penalty_cal) | |
{ | |
return(as.numeric(analysis_res$Accuracy) + penalty_cal ) | |
} | |
model.actuals_score <- function(analysis_res, penalty_cal) | |
{ | |
return(as.numeric(analysis_res$TruePositive) - as.numeric(analysis_res$FalsePositive) - as.numeric(analysis_res$FalseNegative) ) | |
} | |
model.comp_score <- function(analysis_res, penalty_cal) | |
{ | |
return(as.numeric(analysis_res$F1) + as.numeric(analysis_res$Accuracy) + penalty_cal ) | |
} | |
#Scoring function to determine model fitness | |
model.score <- function(analysis_res, penalty_cal) | |
{ | |
return(as.numeric(analysis_res$F1) + penalty_cal ) | |
} | |
#Generic data massaging function in case its needed | |
model.correct_factor <- function(inputSet) | |
{ | |
inputSet$V21 <- factor(inputSet$V21) | |
return(inputSet) | |
} | |
model.print_status <- function(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only) | |
{ | |
#Print debug information, don't do this on JUPYTER | |
if(talk) | |
{ | |
if(progress_only) | |
{ | |
print(paste("Test No: ", steps)) | |
} | |
else | |
{ | |
print("--------------------------------------") | |
print(paste("Test No: ", steps)) | |
print(paste("Best Found at Step:", best_step)) | |
print("--------------------------------------") | |
if(min_talk == FALSE) | |
{ | |
print(paste("Formula : ", formula)) | |
print(paste("Best Formula: ", best_form)) | |
print("--------------------------------------") | |
print(paste("Score : ", score)) | |
print(paste("Best Score: ", best_score)) | |
print("--------------------------------------") | |
print(paste("Threshold: ", i)) | |
print(paste("Best Thrs:", best_thrs)) | |
print("--------------------------------------") | |
print(paste("Threshold Penalty : ", thr_pn)) | |
print(paste("Best Thrs penalty :", best_penalty)) | |
print("--------------------------------------") | |
} | |
else | |
{ | |
print("=====================================================") | |
print("Current Matrix: ") | |
print(analysis_res) | |
print("Best Analysis Matrix: ") | |
print(best_analysis) | |
print("=====================================================") | |
} | |
} | |
} | |
} | |
model.multi_pick <- function(var_size, var_map, vars, target_var){ | |
rng_pic_size = sample(1:var_size, 1) | |
rng_pic_sel = sample(1:var_size, rng_pic_size) | |
for(rng_pic in rng_pic_sel) | |
{ | |
if(var_map[rng_pic] == TRUE) | |
{ | |
var_map[rng_pic] = FALSE | |
} | |
else | |
{ | |
var_map[rng_pic] = TRUE | |
} | |
#Prevent a null formula | |
if(any(var_map) == FALSE) | |
{ | |
var_map[rng_pic] = TRUE | |
} | |
} | |
#Generate new formula | |
new_list <- c() | |
for(i in 1:var_size) | |
{ | |
if(var_map[i]){ | |
new_list <- c(new_list,vars[i]) | |
} | |
} | |
new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula | |
formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use | |
return(formula) | |
} | |
model.single_pick <- function(var_size, var_map, vars, target_var){ | |
#Pick variable to toggle | |
rng_pic = sample(1:var_size, 1) | |
#Toggle the selection | |
if(var_map[rng_pic] == TRUE) | |
{ | |
var_map[rng_pic] = FALSE | |
} | |
else | |
{ | |
var_map[rng_pic] = TRUE | |
} | |
#Prevent a null formula | |
if(any(var_map) == FALSE) | |
{ | |
var_map[rng_pic] = TRUE | |
} | |
#Generate new formula | |
new_list <- c() | |
for(i in 1:var_size) | |
{ | |
if(var_map[i]){ | |
new_list <- c(new_list,vars[i]) | |
} | |
} | |
new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula | |
formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use | |
return(formula) | |
} | |
#The stochastic hill climbing optimiser | |
model.optimiser <- function(trainSet, #Training Set | |
testSet, #Test Set | |
testSetRsp = testSet$V21, #Set the test rsp | |
iter=10000, #No of iterations to optimise | |
target_var="V21", #target variable | |
talk=TRUE, #print optimising information as it runs | |
min_talk = FALSE, #Only put the analysis matrix results | |
progress_only = FALSE, #Only put a progress/percent complete | |
penal_threshold = TRUE, #panalise low threshold values | |
no_threshold = FALSE, # Dont move the threshold value | |
def_threshold = 0.5, #default threshold | |
thrs_low_min = 0, #Lowest possible threshold the optimiser will take | |
penalty_calc = model.penalty_calc, #Penalty Calculator | |
calc_score = model.score, #Score calculator | |
model_analysis, #functions to analyse model results | |
run_model, #Model to execute for evaluation | |
data_massager = model.correct_factor, | |
monte_carlo = FALSE , #Do a monte-carlo style optimisation instead | |
multi_pick = FALSE, #Pick multiple variables to mutate) | |
ret_conf = FALSE ) #Calculate and return the confusion matrix in the end | |
{ | |
#Load lib | |
library(caret) # for confusion matrix and other stats | |
library(e1071) # for naiveBayes | |
library(rpart) | |
#Correct Factor | |
testSet <- data_massager(testSet) | |
trainSet <- data_massager(trainSet) | |
#Initialise optimiser | |
var_size = length(names(trainSet)) -1 | |
vars = names(trainSet) | |
var_map = logical(length = var_size) | |
var_map[1] = TRUE | |
best_formula <- var_map | |
best_thrs <- 0.5 | |
true_count = 1 | |
best_tp <- 0 | |
best_f1 <- 0 | |
best_score <- 0 | |
best_penalty <- 0 | |
best_accuracy <- 0 | |
best_step <- 0 | |
best_form <- "" | |
best_analysis <- data.frame(matrix(ncol = 0, nrow = 0)) | |
#Start the hillclimb | |
for(steps in 1:iter) | |
{ | |
#Copy the best formula on the current one to mutate | |
if(monte_carlo == FALSE ) | |
{ | |
var_map <- best_formula | |
} | |
else | |
{ | |
#Start a new | |
var_map = logical(length = var_size) | |
} | |
if(multi_pick) | |
{ | |
formula <- model.multi_pick(var_size, var_map, vars, target_var) | |
} | |
else | |
{ | |
formula <- model.single_pick(var_size, var_map, vars, target_var) | |
} | |
#Generate a random threshold value b/w 0 and 1 | |
if(no_threshold == FALSE) | |
{ | |
i <- rnorm(1,mean=0.5,sd=0.25) | |
if( i < thrs_low_min ) {i <- def_threshold} #Prevent less than 0 | |
if( i > 1 ) {i <- def_threshold} #Prevent a greater than 1 | |
} | |
else | |
{ | |
i = def_threshold | |
} | |
#Run the model | |
tst_res <- run_model(formula,trainSet,testSet,i) #Analyse its cost | |
analysis_res <- model_analysis(tst_res, testSetRsp) #Generate analysis data | |
#Score | |
if(is.nan(as.numeric(analysis_res$F1)) == FALSE) #If the it has NAN in the confusion matrix reject the mutation | |
{ | |
#Prefer results with a higher threshold, the higher the better, | |
#Move the optimiser to a higher F1 score and higher threshold for the logistic function | |
thr_pn <- penalty_calc(i, penal_threshold) #Calculate a penalty for using a lower threshold | |
score <- calc_score(analysis_res, thr_pn ) #Calculate score | |
model.print_status(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only) | |
if(score > best_score) #If its score is better accept the mutation as the best so far | |
{ | |
best_formula <- var_map | |
best_thrs <- i | |
best_tp <- as.numeric(analysis_res$TruePositive) | |
best_f1 <- as.numeric(analysis_res$F1) | |
best_score <- score | |
best_form <- formula | |
best_step <- steps | |
best_accuracy <- as.numeric(analysis_res$Accuracy) | |
best_penalty <- thr_pn | |
best_analysis <- analysis_res | |
best_res <- tst_res | |
} | |
} | |
} | |
print("=====================================================") | |
print("=====================================================") | |
print("Final Result: ") | |
print(paste("Best Score:", best_score)) | |
print(paste("Best Thrs:", best_thrs)) | |
print(paste("Best TP: ", best_tp)) | |
print(paste("Best F1 Score: ", best_f1)) | |
print(paste("Best Formula: ", best_form)) | |
print(paste("Best Found at Step:", best_step)) | |
print("Best Analysis Matrix: ") | |
print(best_analysis) | |
print("=====================================================") | |
print("=====================================================") | |
if(ret_conf){ | |
tst_rsp <- as.factor(best_res) | |
prd_rsp <- as.factor(testSetRsp) | |
return(confusionMatrix(data=tst_rsp, reference=prd_rsp)) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment