#Returns a randomly generated Dirichlet sample (normalised gamma draws)
random_dirichlet <- function(n_var, alpha, beta) {
  return_val <- numeric(n_var)
  for(i in 1:n_var) {
    return_val[i] <- rgamma(1, shape = alpha, scale = beta)
  }
  #Normalise and round to nice values
  return_val <- round(return_val/sum(return_val) * 100, 1)/100
  return(return_val)
}
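#Quick illustrative check (not part of the original analysis): a draw from
#random_dirichlet should be non-negative and sum to roughly 1; the rounding
#above means the sum can drift slightly, so allow a small tolerance.
#d_check is just a throwaway name for this check.
d_check <- random_dirichlet(5, 10, 1)
stopifnot(all(d_check >= 0), abs(sum(d_check) - 1) < 0.05)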
entropy_x_y_d <- function(dataset, alpha)
{
  #Add Dirichlet noise to the sample counts of each row
  for(i in 1:length(dataset$AUSTRALIA))
  {
    #Generate a Dirichlet sample, 5 variables, alpha set to the input parameter and beta 1
    dirichlet <- random_dirichlet(5, alpha, 1)
    dataset$AUSTRALIA[i] <- dataset$AUSTRALIA[i] + round(dirichlet[1] * alpha, 0)
    dataset$CHINA[i]     <- dataset$CHINA[i]     + round(dirichlet[2] * alpha, 0)
    dataset$ENGLAND[i]   <- dataset$ENGLAND[i]   + round(dirichlet[3] * alpha, 0)
    dataset$INDIA[i]     <- dataset$INDIA[i]     + round(dirichlet[4] * alpha, 0)
    dataset$OTHERS[i]    <- dataset$OTHERS[i]    + round(dirichlet[5] * alpha, 0)
  }
  dataset$lga_size <- dataset$AUSTRALIA + dataset$CHINA + dataset$ENGLAND + dataset$INDIA + dataset$OTHERS
  total_size <- sum(dataset$lga_size)
  #Calculate the Shannon entropy of each row: -sum(p * log2(p)) over the five groups
  dataset$lga_e <- (
    (-1 * ( (dataset$AUSTRALIA / dataset$lga_size) * log2(dataset$AUSTRALIA / dataset$lga_size) )) +
    (-1 * ( (dataset$ENGLAND   / dataset$lga_size) * log2(dataset$ENGLAND   / dataset$lga_size) )) +
    (-1 * ( (dataset$CHINA     / dataset$lga_size) * log2(dataset$CHINA     / dataset$lga_size) )) +
    (-1 * ( (dataset$INDIA     / dataset$lga_size) * log2(dataset$INDIA     / dataset$lga_size) )) +
    (-1 * ( (dataset$OTHERS    / dataset$lga_size) * log2(dataset$OTHERS    / dataset$lga_size) ))
  )
  #Weight each row's entropy by its share of the total population
  dataset$lg_mwa <- (dataset$lga_size/total_size) * dataset$lga_e
  #Sum the weighted values to get the overall expected entropy
  t_ent <- sum(dataset$lg_mwa)
  return(t_ent)
}
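#Worked check of the per-row entropy formula (illustrative, made-up numbers):
#a uniform split across the 5 groups maximises the entropy at log2(5) ~ 2.32 bits
p_uniform <- rep(1/5, 5)
sum(-p_uniform * log2(p_uniform)) #equals log2(5)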
#Generate the plot data
plot_points <- numeric(100)
for(i in 1:100)
{
  #Add 5 to increase the starting bias, as anything < 5 will cause it to generate 0
  #values
  plot_points[i] <- entropy_x_y_d(data, i + 5)
}
#Plot the data
plot(seq(1,100), plot_points, ylab="Entropy (in bits)", xlab="Alpha size")
#Initial starting conditions: a zero-filled parameter array and a test beta
ex_b = c(0.43, -1.7, 2.8)
initial_theta <- rep(0, ncol(XTest))
#The z function: computes the linear predictor that is fed to the tanh activation
#(beta[1] acts as the intercept; the i_num argument is unused)
q2.util.z <- function(beta, i_num, x_row) {
  zv <- beta[1] + x_row[2]*beta[2] + x_row[3]*beta[3]
  return(zv)
}
#Various utility functions that cover steps of the log likelihood
q2.util.p <- function(i_z) return(exp(i_z) + exp(-1*i_z)) #Normaliser of the tanh model: exp(z) + exp(-z), i.e. 2*cosh(z)
q2.util.l <- function(p_z) return(-1*log(p_z)) #Returns the negative log of the value
q2.util.yb <- function(y, p_z) return(y*p_z) #Multiplies y by its argument (called with z below)
q2.util.norm <- function(v, n) return(v/n) #Divides all the values by n
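#Side note (illustrative check, not in the original gist): the derivative of
#log(exp(z) + exp(-z)) is tanh(z), which is where the tanh in the gradient
#function further down comes from; a quick numerical confirmation at an
#arbitrary point z0:
z0 <- 0.7
h  <- 1e-6
(log(q2.util.p(z0 + h)) - log(q2.util.p(z0 - h))) / (2*h) #central difference
tanh(z0)                                                  #should match closely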
#This function is the log likelihood function as provided
#The algorithm is basic: it adds up all the
#values according to the supplied formula
q2.t_log <- function(init_b=ex_b, X=XTest, Y=YTest)
{
  n <- length(Y)
  sum_t <- 0
  for(i in 1:n)
  {
    x_r <- X[i,]
    zv <- q2.util.z(init_b, i, x_r)
    pv <- q2.util.p(zv)
    ll <- (q2.util.l(pv) + q2.util.yb(Y[i], zv))
    sum_t <- sum_t + ll
  }
  sum_t <- sum_t * -1
  return(sum_t)
}
#This is the gradient descent function
#It is based on the derivative of the provided log likelihood function,
#of the form yi*xi - bi*tanh(z)
#where z = b0x0 + b1x1 + b2x2
#While a specific derived b0 function was calculated separately,
#I found that the general form worked as well
q2.t_gradient <- function(init_b=ex_b, X=XTest, Y=YTest)
{
  n <- length(Y)
  alpha <- 0.00526 #Value experimentally determined for stability
  gradient <- init_b #copy the array
  for(i in 1:n)
  {
    x_r <- X[i,]
    zv <- q2.util.z(gradient, i, x_r)
    #Work out the gradient using the formula
    b0 <- ((Y[i]*x_r[1]) - (gradient[1]*tanh(zv)))
    b1 <- ((Y[i]*x_r[2]) - (gradient[2]*tanh(zv)))
    b2 <- ((Y[i]*x_r[3]) - (gradient[3]*tanh(zv)))
    #Once the gradients are calculated, put them back into the array
    #following the update rule x = x - alpha*gradient(x)
    gradient[1] = gradient[1] - (alpha * b0)
    gradient[2] = gradient[2] - (alpha * b1)
    gradient[3] = gradient[3] - (alpha * b2)
  }
  return(gradient)
}
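#Optional cross-check (illustrative, not in the original gist; relies on XTest
#and YTest being loaded, as elsewhere in this script): q2.t_gradient returns
#updated parameters after one pass rather than a raw gradient, so as a rough
#sanity check compare the direction of its update against a central
#finite-difference gradient of q2.t_log around ex_b; num_grad is just a
#throwaway name for this check
num_grad <- sapply(1:3, function(j) {
  h  <- 1e-5
  bp <- ex_b; bm <- ex_b
  bp[j] <- bp[j] + h
  bm[j] <- bm[j] - h
  (q2.t_log(bp) - q2.t_log(bm)) / (2*h)
})
num_grad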
q2.var.normal <- 0
q2.var.normal.sd <- 10
#Ok, the Gaussian theory for P(BETA|X) = (P(X|BETA)P(BETA))/P(X)
#The log taken is: log P(BETA) + log P(X|BETA) - log P(X)
#Normally P(X) is ignored, following: https://wiseodd.github.io/techblog/2017/01/01/mle-vs-map/
#Giving us log P(BETA) + log P(X|BETA), where log P(X|BETA) is our MLE objective to begin with!
#Where P(BETA) = Normal(BETA)
#So we take our MLE and ADD the log P(BETA) component to the solver
#This function calculates the log of the normal distribution using the PDF
#https://www.ime.unicamp.br/~cnaber/optim%202.pdf
#Following the document, the beta prior is calculated as the log of the normal PDF,
#not the log-normal distribution; it follows (1/(2*sd))*b^2
#This gives us the log of the normal in the P(BETA) component
q2.util.p_beta <- function(beta) {
  lp1 <- beta[1]^2
  lp2 <- beta[2]^2
  lp3 <- beta[3]^2
  lp <- (lp1 + lp2 + lp3)*(1/(q2.var.normal.sd*2)) #Finally compute the log prior
  return(lp) #the betas' normal-curve prior
}
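#Cross-check (illustrative; assumes q2.var.normal.sd is treated as the prior
#variance sigma^2 and that the additive normalising constant is dropped): the
#penalty should then match the negative log of the normal PDF up to that constant
s2 <- q2.var.normal.sd
-sum(dnorm(ex_b, mean = 0, sd = sqrt(s2), log = TRUE)) - 1.5*log(2*pi*s2) #~ q2.util.p_beta(ex_b)
q2.util.p_beta(ex_b)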
q2.t_blog <- function(init_b=ex_b, X=XTest, Y=YTest)
{
  n <- length(Y)
  sum_t <- 0
  lp <- q2.util.p_beta(init_b)
  for(i in 1:n)
  {
    x_r <- X[i,]
    zv <- q2.util.z(init_b, i, x_r)
    pv <- q2.util.p(zv)
    ll <- (q2.util.l(pv) + q2.util.yb(Y[i], zv))
    sum_t <- sum_t + ll
  }
  #Negate the likelihood term and add the prior penalty (the MAP objective)
  sum_t <- (sum_t * -1) + lp
  return(sum_t)
}
print("Sanity check the likelihood function")
q2.t_log()
q2.t_blog()
print("Sanity Check the gradient - see if its close to the given beta when it returns")
q2.t_gradient()
print("Sanity Check the gradient - see if its behaves with a 0 set")
q2.t_gradient(init_b=initial_theta)
print("Optim - no Gradient used - No MAP")
optim(par=initial_theta,X=X,Y=Y,fn=q2.t_log)$par
print("BFGS - Gradient used - No MAP")
optim(par=initial_theta,X=X,Y=Y,fn=q2.t_log,gr=q2.t_gradient, method="BFGS")$par
print("Optim - no Gradient used - using MAP")
optim(par=initial_theta,X=X,Y=Y,fn=q2.t_blog)$par
print("BFGS - Gradient used - using MAP")
optim(par=initial_theta,X=X,Y=Y,fn=q2.t_blog,gr=q2.t_gradient, method="BFGS")$par