myrddian · September 28, 2018 07:06
diff --git a/optimiser.R b/optimiser.R
 model.analysis <-function(test_res, predict_rsp) {


  col_names <- c("TruePositive","FalsePositive",
                 "FalseNegative","Accuracy","Precision","Recall","F1")
  ret_val <- data.frame(matrix(ncol = length(col_names), nrow = 0))
  colnames(ret_val) <- col_names
  ret_val[nrow(ret_val) + 1,] = c(tp,fp,fn,accuracy,precision,recall,f1)
  return(ret_val)
 }


 #Penalty Calculator
 model.penalty_calc <- function(thrs, penal_threshold)
 {
  #return zero
  if(penal_threshold == FALSE) {
    return(0)
  }
  
  #Otherwise calculate and return
  n_p <- (thrs*0.5) - 0.25
  
  #Clip return values to within -0.25 and 0.25
  if(n_p > 0.25)
  {
    return(0.25)
  }
  if(n_p < -0.25)
  {
    return(-0.25)
  }
  #Return the penalty
  return(n_p)
 }

 model.acc_score <- function(analysis_res, penalty_cal)
 {
  return(as.numeric(analysis_res$Accuracy) +  penalty_cal )
 }

 model.actuals_score <- function(analysis_res, penalty_cal)
 {
  return(as.numeric(analysis_res$TruePositive) - as.numeric(analysis_res$FalsePositive) - as.numeric(analysis_res$FalseNegative) )
 }

 model.comp_score <- function(analysis_res, penalty_cal)
 {
  return(as.numeric(analysis_res$F1) + as.numeric(analysis_res$Accuracy) +  penalty_cal )
 }

 #Scoring function to determine model fitness
 model.score <- function(analysis_res, penalty_cal)
 {
  return(as.numeric(analysis_res$F1) +  penalty_cal )
 }

 #Generic data massaging function in case its needed
 model.correct_factor <- function(inputSet)
 {
  inputSet$V21  <- factor(inputSet$V21)
  return(inputSet)
 }

 model.print_status <- function(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only)
 {
  #Print debug information, don't do this on JUPYTER
  if(talk)
  {
    if(progress_only)
    {
      print(paste("Test No: ", steps))
    }
    else
    {
      print("--------------------------------------")
      print(paste("Test No: ", steps))
      print(paste("Best Found at Step:", best_step))
      print("--------------------------------------")
      if(min_talk == FALSE)
      {
        print(paste("Formula : ", formula))
        print(paste("Best Formula: ", best_form))
        print("--------------------------------------")
        print(paste("Score : ", score))
        print(paste("Best Score: ", best_score))
        print("--------------------------------------")
        print(paste("Threshold: ", i))
        print(paste("Best Thrs:", best_thrs))
        print("--------------------------------------")
        print(paste("Threshold Penalty : ", thr_pn))
        print(paste("Best Thrs penalty :", best_penalty))
        print("--------------------------------------")
      }
      else
      {
        print("=====================================================")
        print("Current Matrix: ")
        print(analysis_res)
        print("Best Analysis Matrix: ")
        print(best_analysis)
        print("=====================================================")
      }
    }
  }
 }

 model.multi_pick <- function(var_size, var_map, vars, target_var){
  rng_pic_size = sample(1:var_size, 1)
  rng_pic_sel = sample(1:var_size, rng_pic_size)
  
  for(rng_pic in rng_pic_sel)
  {
    if(var_map[rng_pic] == TRUE)
    {
      var_map[rng_pic] = FALSE
    }
    else
    {
      var_map[rng_pic] = TRUE
    }
    #Prevent a null formula
    if(any(var_map) == FALSE)
    {
      var_map[rng_pic] = TRUE
    }
  }
  #Generate new formula
  new_list <- c()
  for(i in 1:var_size)
  {
    if(var_map[i]){
      new_list <- c(new_list,vars[i])
    }
  }
  new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula
  formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use
  return(formula)
 }

 model.single_pick <- function(var_size, var_map, vars, target_var){
  
  #Pick variable to toggle
  rng_pic = sample(1:var_size, 1)
  #Toggle the selection
  if(var_map[rng_pic] == TRUE)
  {
    var_map[rng_pic] = FALSE
  }
  else
  {
    var_map[rng_pic] = TRUE
  }
  #Prevent a null formula
  if(any(var_map) == FALSE)
  {
    var_map[rng_pic] = TRUE
  }
  
  #Generate new formula
  new_list <- c()
  for(i in 1:var_size)
  {
    if(var_map[i]){
      new_list <- c(new_list,vars[i])
    }
  }
  new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula
  formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use
  return(formula)
 }

 #The stochastic hill climbing optimiser
 model.optimiser <- function(trainSet, #Training Set
                         testSet,     #Test Set
                         testSetRsp = testSet$V21, #Set the test rsp
                         iter=10000,  #No of iterations to optimise
                         target_var="V21",  #target variable
                         talk=TRUE,  #print optimising information as it runs
                         min_talk = FALSE, #Only put the analysis matrix results
                         progress_only = FALSE, #Only put a progress/percent complete
                         penal_threshold = TRUE,   #panalise low threshold values
                         no_threshold = FALSE, # Dont move the threshold value
                         def_threshold = 0.5, #default threshold
                         thrs_low_min = 0, #Lowest possible threshold the optimiser will take
                         penalty_calc = model.penalty_calc, #Penalty Calculator
                         calc_score = model.score, #Score calculator
                         model_analysis,  #functions to analyse model results
                         run_model, #Model to execute for evaluation
                         data_massager = model.correct_factor,
                         monte_carlo = FALSE , #Do a monte-carlo style optimisation instead 
                         multi_pick = FALSE, #Pick multiple variables to mutate)   
                         ret_conf = FALSE )  #Calculate and return the confusion matrix in the end
 {
  #Load lib
  library(caret) # for confusion matrix and other stats
  library(e1071) # for naiveBayes
  library(rpart)
  
  #Correct Factor
  testSet  <- data_massager(testSet)
  trainSet <- data_massager(trainSet)
  
  #Initialise optimiser
  var_size = length(names(trainSet)) -1
  vars  = names(trainSet)
  var_map = logical(length = var_size)
  var_map[1] = TRUE
  best_formula <- var_map
  best_thrs <- 0.5
  true_count = 1
  best_tp <- 0 
  best_f1  <- 0 
  best_score <- 0
  best_penalty <- 0
  best_accuracy <- 0
  best_step <- 0
  best_form <- ""
  best_analysis <- data.frame(matrix(ncol = 0, nrow = 0))
  #Start the hillclimb
  for(steps in 1:iter)
  {
    #Copy the best formula on the current one to mutate
    if(monte_carlo == FALSE )
    {
      var_map <- best_formula
    }
    else
    {
      #Start a new
      var_map = logical(length = var_size)
    }
    
    if(multi_pick)
    {
      formula <- model.multi_pick(var_size, var_map, vars, target_var) 
    }
    else
    {
      formula <- model.single_pick(var_size, var_map, vars, target_var) 
    }
    
    #Generate a random threshold value b/w 0 and 1
    if(no_threshold == FALSE)
    {
      i <- rnorm(1,mean=0.5,sd=0.25)
      if( i < thrs_low_min ) {i <- def_threshold} #Prevent less than 0
      if( i > 1 ) {i <- def_threshold} #Prevent a greater than 1 
    }
    else
    {
      i = def_threshold
    }

    #Run the model
    tst_res <- run_model(formula,trainSet,testSet,i) #Analyse its cost
    analysis_res <- model_analysis(tst_res, testSetRsp)  #Generate analysis data

    #Score
    if(is.nan(as.numeric(analysis_res$F1)) == FALSE)  #If the it has NAN in the confusion matrix reject the mutation
    {
      #Prefer results with a higher threshold, the higher the better, 
      #Move the optimiser to a higher F1 score and higher threshold for the logistic function
      thr_pn <- penalty_calc(i, penal_threshold) #Calculate a penalty for using a lower threshold
      score <-  calc_score(analysis_res, thr_pn )   #Calculate score
      
      model.print_status(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only)
     
      if(score > best_score)   #If its score is better accept the mutation as the best so far
      {
        best_formula <- var_map
        best_thrs <- i
        best_tp <- as.numeric(analysis_res$TruePositive)
        best_f1 <- as.numeric(analysis_res$F1)
        best_score <- score
        best_form <- formula
        best_step <- steps
        best_accuracy <- as.numeric(analysis_res$Accuracy)
        best_penalty <- thr_pn
        best_analysis <- analysis_res
        best_res <- tst_res
      }
    }
  }
  print("=====================================================")
  print("=====================================================")
  print("Final Result: ")
  print(paste("Best Score:", best_score))
  print(paste("Best Thrs:", best_thrs))
  print(paste("Best TP: ", best_tp))
  print(paste("Best F1 Score: ", best_f1))
  print(paste("Best Formula: ", best_form))
  print(paste("Best Found at Step:", best_step))
  print("Best Analysis Matrix: ")
  print(best_analysis)
  print("=====================================================")
  print("=====================================================")
  
  if(ret_conf){
    tst_rsp <- as.factor(best_res)
    prd_rsp <- as.factor(testSetRsp)
    return(confusionMatrix(data=tst_rsp, reference=prd_rsp))
  }
 }
	model.analysis <-function(test_res, predict_rsp) {


	col_names <- c("TruePositive","FalsePositive",
	"FalseNegative","Accuracy","Precision","Recall","F1")
	ret_val <- data.frame(matrix(ncol = length(col_names), nrow = 0))
	colnames(ret_val) <- col_names
	ret_val[nrow(ret_val) + 1,] = c(tp,fp,fn,accuracy,precision,recall,f1)
	return(ret_val)
	}


	#Penalty Calculator
	model.penalty_calc <- function(thrs, penal_threshold)
	{
	#return zero
	if(penal_threshold == FALSE) {
	return(0)
	}

	#Otherwise calculate and return
	n_p <- (thrs*0.5) - 0.25

	#Clip return values to within -0.25 and 0.25
	if(n_p > 0.25)
	{
	return(0.25)
	}
	if(n_p < -0.25)
	{
	return(-0.25)
	}
	#Return the penalty
	return(n_p)
	}

	model.acc_score <- function(analysis_res, penalty_cal)
	{
	return(as.numeric(analysis_res$Accuracy) + penalty_cal )
	}

	model.actuals_score <- function(analysis_res, penalty_cal)
	{
	return(as.numeric(analysis_res$TruePositive) - as.numeric(analysis_res$FalsePositive) - as.numeric(analysis_res$FalseNegative) )
	}

	model.comp_score <- function(analysis_res, penalty_cal)
	{
	return(as.numeric(analysis_res$F1) + as.numeric(analysis_res$Accuracy) + penalty_cal )
	}

	#Scoring function to determine model fitness
	model.score <- function(analysis_res, penalty_cal)
	{
	return(as.numeric(analysis_res$F1) + penalty_cal )
	}

	#Generic data massaging function in case its needed
	model.correct_factor <- function(inputSet)
	{
	inputSet$V21 <- factor(inputSet$V21)
	return(inputSet)
	}

	model.print_status <- function(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only)
	{
	#Print debug information, don't do this on JUPYTER
	if(talk)
	{
	if(progress_only)
	{
	print(paste("Test No: ", steps))
	}
	else
	{
	print("--------------------------------------")
	print(paste("Test No: ", steps))
	print(paste("Best Found at Step:", best_step))
	print("--------------------------------------")
	if(min_talk == FALSE)
	{
	print(paste("Formula : ", formula))
	print(paste("Best Formula: ", best_form))
	print("--------------------------------------")
	print(paste("Score : ", score))
	print(paste("Best Score: ", best_score))
	print("--------------------------------------")
	print(paste("Threshold: ", i))
	print(paste("Best Thrs:", best_thrs))
	print("--------------------------------------")
	print(paste("Threshold Penalty : ", thr_pn))
	print(paste("Best Thrs penalty :", best_penalty))
	print("--------------------------------------")
	}
	else
	{
	print("=====================================================")
	print("Current Matrix: ")
	print(analysis_res)
	print("Best Analysis Matrix: ")
	print(best_analysis)
	print("=====================================================")
	}
	}
	}
	}

	model.multi_pick <- function(var_size, var_map, vars, target_var){
	rng_pic_size = sample(1:var_size, 1)
	rng_pic_sel = sample(1:var_size, rng_pic_size)

	for(rng_pic in rng_pic_sel)
	{
	if(var_map[rng_pic] == TRUE)
	{
	var_map[rng_pic] = FALSE
	}
	else
	{
	var_map[rng_pic] = TRUE
	}
	#Prevent a null formula
	if(any(var_map) == FALSE)
	{
	var_map[rng_pic] = TRUE
	}
	}
	#Generate new formula
	new_list <- c()
	for(i in 1:var_size)
	{
	if(var_map[i]){
	new_list <- c(new_list,vars[i])
	}
	}
	new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula
	formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use
	return(formula)
	}

	model.single_pick <- function(var_size, var_map, vars, target_var){

	#Pick variable to toggle
	rng_pic = sample(1:var_size, 1)
	#Toggle the selection
	if(var_map[rng_pic] == TRUE)
	{
	var_map[rng_pic] = FALSE
	}
	else
	{
	var_map[rng_pic] = TRUE
	}
	#Prevent a null formula
	if(any(var_map) == FALSE)
	{
	var_map[rng_pic] = TRUE
	}

	#Generate new formula
	new_list <- c()
	for(i in 1:var_size)
	{
	if(var_map[i]){
	new_list <- c(new_list,vars[i])
	}
	}
	new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula
	formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use
	return(formula)
	}

	#The stochastic hill climbing optimiser
	model.optimiser <- function(trainSet, #Training Set
	testSet, #Test Set
	testSetRsp = testSet$V21, #Set the test rsp
	iter=10000, #No of iterations to optimise
	target_var="V21", #target variable
	talk=TRUE, #print optimising information as it runs
	min_talk = FALSE, #Only put the analysis matrix results
	progress_only = FALSE, #Only put a progress/percent complete
	penal_threshold = TRUE, #panalise low threshold values
	no_threshold = FALSE, # Dont move the threshold value
	def_threshold = 0.5, #default threshold
	thrs_low_min = 0, #Lowest possible threshold the optimiser will take
	penalty_calc = model.penalty_calc, #Penalty Calculator
	calc_score = model.score, #Score calculator
	model_analysis, #functions to analyse model results
	run_model, #Model to execute for evaluation
	data_massager = model.correct_factor,
	monte_carlo = FALSE , #Do a monte-carlo style optimisation instead
	multi_pick = FALSE, #Pick multiple variables to mutate)
	ret_conf = FALSE ) #Calculate and return the confusion matrix in the end
	{
	#Load lib
	library(caret) # for confusion matrix and other stats
	library(e1071) # for naiveBayes
	library(rpart)

	#Correct Factor
	testSet <- data_massager(testSet)
	trainSet <- data_massager(trainSet)

	#Initialise optimiser
	var_size = length(names(trainSet)) -1
	vars = names(trainSet)
	var_map = logical(length = var_size)
	var_map[1] = TRUE
	best_formula <- var_map
	best_thrs <- 0.5
	true_count = 1
	best_tp <- 0
	best_f1 <- 0
	best_score <- 0
	best_penalty <- 0
	best_accuracy <- 0
	best_step <- 0
	best_form <- ""
	best_analysis <- data.frame(matrix(ncol = 0, nrow = 0))
	#Start the hillclimb
	for(steps in 1:iter)
	{
	#Copy the best formula on the current one to mutate
	if(monte_carlo == FALSE )
	{
	var_map <- best_formula
	}
	else
	{
	#Start a new
	var_map = logical(length = var_size)
	}

	if(multi_pick)
	{
	formula <- model.multi_pick(var_size, var_map, vars, target_var)
	}
	else
	{
	formula <- model.single_pick(var_size, var_map, vars, target_var)
	}

	#Generate a random threshold value b/w 0 and 1
	if(no_threshold == FALSE)
	{
	i <- rnorm(1,mean=0.5,sd=0.25)
	if( i < thrs_low_min ) {i <- def_threshold} #Prevent less than 0
	if( i > 1 ) {i <- def_threshold} #Prevent a greater than 1
	}
	else
	{
	i = def_threshold
	}

	#Run the model
	tst_res <- run_model(formula,trainSet,testSet,i) #Analyse its cost
	analysis_res <- model_analysis(tst_res, testSetRsp) #Generate analysis data

	#Score
	if(is.nan(as.numeric(analysis_res$F1)) == FALSE) #If the it has NAN in the confusion matrix reject the mutation
	{
	#Prefer results with a higher threshold, the higher the better,
	#Move the optimiser to a higher F1 score and higher threshold for the logistic function
	thr_pn <- penalty_calc(i, penal_threshold) #Calculate a penalty for using a lower threshold
	score <- calc_score(analysis_res, thr_pn ) #Calculate score

	model.print_status(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only)

	if(score > best_score) #If its score is better accept the mutation as the best so far
	{
	best_formula <- var_map
	best_thrs <- i
	best_tp <- as.numeric(analysis_res$TruePositive)
	best_f1 <- as.numeric(analysis_res$F1)
	best_score <- score
	best_form <- formula
	best_step <- steps
	best_accuracy <- as.numeric(analysis_res$Accuracy)
	best_penalty <- thr_pn
	best_analysis <- analysis_res
	best_res <- tst_res
	}
	}
	}
	print("=====================================================")
	print("=====================================================")
	print("Final Result: ")
	print(paste("Best Score:", best_score))
	print(paste("Best Thrs:", best_thrs))
	print(paste("Best TP: ", best_tp))
	print(paste("Best F1 Score: ", best_f1))
	print(paste("Best Formula: ", best_form))
	print(paste("Best Found at Step:", best_step))
	print("Best Analysis Matrix: ")
	print(best_analysis)
	print("=====================================================")
	print("=====================================================")

	if(ret_conf){
	tst_rsp <- as.factor(best_res)
	prd_rsp <- as.factor(testSetRsp)
	return(confusionMatrix(data=tst_rsp, reference=prd_rsp))
	}
	}