Skip to content

Instantly share code, notes, and snippets.

@myrddian
Created September 28, 2018 07:06
Show Gist options
  • Save myrddian/01713b8d846dc52c6a0d267ccde100fb to your computer and use it in GitHub Desktop.
Save myrddian/01713b8d846dc52c6a0d267ccde100fb to your computer and use it in GitHub Desktop.
model.analysis <-function(test_res, predict_rsp) {
col_names <- c("TruePositive","FalsePositive",
"FalseNegative","Accuracy","Precision","Recall","F1")
ret_val <- data.frame(matrix(ncol = length(col_names), nrow = 0))
colnames(ret_val) <- col_names
ret_val[nrow(ret_val) + 1,] = c(tp,fp,fn,accuracy,precision,recall,f1)
return(ret_val)
}
#Penalty Calculator
model.penalty_calc <- function(thrs, penal_threshold)
{
#return zero
if(penal_threshold == FALSE) {
return(0)
}
#Otherwise calculate and return
n_p <- (thrs*0.5) - 0.25
#Clip return values to within -0.25 and 0.25
if(n_p > 0.25)
{
return(0.25)
}
if(n_p < -0.25)
{
return(-0.25)
}
#Return the penalty
return(n_p)
}
model.acc_score <- function(analysis_res, penalty_cal)
{
return(as.numeric(analysis_res$Accuracy) + penalty_cal )
}
model.actuals_score <- function(analysis_res, penalty_cal)
{
return(as.numeric(analysis_res$TruePositive) - as.numeric(analysis_res$FalsePositive) - as.numeric(analysis_res$FalseNegative) )
}
model.comp_score <- function(analysis_res, penalty_cal)
{
return(as.numeric(analysis_res$F1) + as.numeric(analysis_res$Accuracy) + penalty_cal )
}
#Scoring function to determine model fitness
model.score <- function(analysis_res, penalty_cal)
{
return(as.numeric(analysis_res$F1) + penalty_cal )
}
#Generic data massaging function in case its needed
model.correct_factor <- function(inputSet)
{
inputSet$V21 <- factor(inputSet$V21)
return(inputSet)
}
model.print_status <- function(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only)
{
#Print debug information, don't do this on JUPYTER
if(talk)
{
if(progress_only)
{
print(paste("Test No: ", steps))
}
else
{
print("--------------------------------------")
print(paste("Test No: ", steps))
print(paste("Best Found at Step:", best_step))
print("--------------------------------------")
if(min_talk == FALSE)
{
print(paste("Formula : ", formula))
print(paste("Best Formula: ", best_form))
print("--------------------------------------")
print(paste("Score : ", score))
print(paste("Best Score: ", best_score))
print("--------------------------------------")
print(paste("Threshold: ", i))
print(paste("Best Thrs:", best_thrs))
print("--------------------------------------")
print(paste("Threshold Penalty : ", thr_pn))
print(paste("Best Thrs penalty :", best_penalty))
print("--------------------------------------")
}
else
{
print("=====================================================")
print("Current Matrix: ")
print(analysis_res)
print("Best Analysis Matrix: ")
print(best_analysis)
print("=====================================================")
}
}
}
}
model.multi_pick <- function(var_size, var_map, vars, target_var){
rng_pic_size = sample(1:var_size, 1)
rng_pic_sel = sample(1:var_size, rng_pic_size)
for(rng_pic in rng_pic_sel)
{
if(var_map[rng_pic] == TRUE)
{
var_map[rng_pic] = FALSE
}
else
{
var_map[rng_pic] = TRUE
}
#Prevent a null formula
if(any(var_map) == FALSE)
{
var_map[rng_pic] = TRUE
}
}
#Generate new formula
new_list <- c()
for(i in 1:var_size)
{
if(var_map[i]){
new_list <- c(new_list,vars[i])
}
}
new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula
formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use
return(formula)
}
model.single_pick <- function(var_size, var_map, vars, target_var){
#Pick variable to toggle
rng_pic = sample(1:var_size, 1)
#Toggle the selection
if(var_map[rng_pic] == TRUE)
{
var_map[rng_pic] = FALSE
}
else
{
var_map[rng_pic] = TRUE
}
#Prevent a null formula
if(any(var_map) == FALSE)
{
var_map[rng_pic] = TRUE
}
#Generate new formula
new_list <- c()
for(i in 1:var_size)
{
if(var_map[i]){
new_list <- c(new_list,vars[i])
}
}
new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula
formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use
return(formula)
}
#The stochastic hill climbing optimiser
model.optimiser <- function(trainSet, #Training Set
testSet, #Test Set
testSetRsp = testSet$V21, #Set the test rsp
iter=10000, #No of iterations to optimise
target_var="V21", #target variable
talk=TRUE, #print optimising information as it runs
min_talk = FALSE, #Only put the analysis matrix results
progress_only = FALSE, #Only put a progress/percent complete
penal_threshold = TRUE, #panalise low threshold values
no_threshold = FALSE, # Dont move the threshold value
def_threshold = 0.5, #default threshold
thrs_low_min = 0, #Lowest possible threshold the optimiser will take
penalty_calc = model.penalty_calc, #Penalty Calculator
calc_score = model.score, #Score calculator
model_analysis, #functions to analyse model results
run_model, #Model to execute for evaluation
data_massager = model.correct_factor,
monte_carlo = FALSE , #Do a monte-carlo style optimisation instead
multi_pick = FALSE, #Pick multiple variables to mutate)
ret_conf = FALSE ) #Calculate and return the confusion matrix in the end
{
#Load lib
library(caret) # for confusion matrix and other stats
library(e1071) # for naiveBayes
library(rpart)
#Correct Factor
testSet <- data_massager(testSet)
trainSet <- data_massager(trainSet)
#Initialise optimiser
var_size = length(names(trainSet)) -1
vars = names(trainSet)
var_map = logical(length = var_size)
var_map[1] = TRUE
best_formula <- var_map
best_thrs <- 0.5
true_count = 1
best_tp <- 0
best_f1 <- 0
best_score <- 0
best_penalty <- 0
best_accuracy <- 0
best_step <- 0
best_form <- ""
best_analysis <- data.frame(matrix(ncol = 0, nrow = 0))
#Start the hillclimb
for(steps in 1:iter)
{
#Copy the best formula on the current one to mutate
if(monte_carlo == FALSE )
{
var_map <- best_formula
}
else
{
#Start a new
var_map = logical(length = var_size)
}
if(multi_pick)
{
formula <- model.multi_pick(var_size, var_map, vars, target_var)
}
else
{
formula <- model.single_pick(var_size, var_map, vars, target_var)
}
#Generate a random threshold value b/w 0 and 1
if(no_threshold == FALSE)
{
i <- rnorm(1,mean=0.5,sd=0.25)
if( i < thrs_low_min ) {i <- def_threshold} #Prevent less than 0
if( i > 1 ) {i <- def_threshold} #Prevent a greater than 1
}
else
{
i = def_threshold
}
#Run the model
tst_res <- run_model(formula,trainSet,testSet,i) #Analyse its cost
analysis_res <- model_analysis(tst_res, testSetRsp) #Generate analysis data
#Score
if(is.nan(as.numeric(analysis_res$F1)) == FALSE) #If the it has NAN in the confusion matrix reject the mutation
{
#Prefer results with a higher threshold, the higher the better,
#Move the optimiser to a higher F1 score and higher threshold for the logistic function
thr_pn <- penalty_calc(i, penal_threshold) #Calculate a penalty for using a lower threshold
score <- calc_score(analysis_res, thr_pn ) #Calculate score
model.print_status(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only)
if(score > best_score) #If its score is better accept the mutation as the best so far
{
best_formula <- var_map
best_thrs <- i
best_tp <- as.numeric(analysis_res$TruePositive)
best_f1 <- as.numeric(analysis_res$F1)
best_score <- score
best_form <- formula
best_step <- steps
best_accuracy <- as.numeric(analysis_res$Accuracy)
best_penalty <- thr_pn
best_analysis <- analysis_res
best_res <- tst_res
}
}
}
print("=====================================================")
print("=====================================================")
print("Final Result: ")
print(paste("Best Score:", best_score))
print(paste("Best Thrs:", best_thrs))
print(paste("Best TP: ", best_tp))
print(paste("Best F1 Score: ", best_f1))
print(paste("Best Formula: ", best_form))
print(paste("Best Found at Step:", best_step))
print("Best Analysis Matrix: ")
print(best_analysis)
print("=====================================================")
print("=====================================================")
if(ret_conf){
tst_rsp <- as.factor(best_res)
prd_rsp <- as.factor(testSetRsp)
return(confusionMatrix(data=tst_rsp, reference=prd_rsp))
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment