#Returns a randomly generated Dirichlet sample (normalised gamma draws)
random_dirichlet <- function(n_var, alpha, beta) {
  return_val <- numeric(n_var)
  for(i in 1:n_var) {
    return_val[i] <- rgamma(1, shape = alpha, scale = beta)
  }
  #Normalise and round to nice values
  return_val <- round(return_val/sum(return_val) * 100, 1)/100
  return(return_val)
}
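#Quick illustrative check (not part of the original analysis): a draw from
#random_dirichlet should be non-negative and sum to roughly 1; the rounding
#above means the sum can drift slightly, so allow a small tolerance.
#d_check is just a throwaway name for this check.
d_check <- random_dirichlet(5, 10, 1)
stopifnot(all(d_check >= 0), abs(sum(d_check) - 1) < 0.05)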
entropy_x_y_d <- function(dataset, alpha)
{
  #Add Dirichlet noise to the sample counts of each row
  for(i in 1:length(dataset$AUSTRALIA))
  {
    #Generate a Dirichlet sample, 5 variables, alpha set to the input parameter and beta 1
    dirichlet <- random_dirichlet(5, alpha, 1)
    dataset$AUSTRALIA[i] <- dataset$AUSTRALIA[i] + round(dirichlet[1] * alpha, 0)
    dataset$CHINA[i]     <- dataset$CHINA[i]     + round(dirichlet[2] * alpha, 0)
    dataset$ENGLAND[i]   <- dataset$ENGLAND[i]   + round(dirichlet[3] * alpha, 0)
    dataset$INDIA[i]     <- dataset$INDIA[i]     + round(dirichlet[4] * alpha, 0)
    dataset$OTHERS[i]    <- dataset$OTHERS[i]    + round(dirichlet[5] * alpha, 0)
  }
  dataset$lga_size <- dataset$AUSTRALIA + dataset$CHINA + dataset$ENGLAND + dataset$INDIA + dataset$OTHERS
  total_size <- sum(dataset$lga_size)
  #Calculate the Shannon entropy of each row: -sum(p * log2(p)) over the five groups
  dataset$lga_e <- (
    (-1 * ( (dataset$AUSTRALIA / dataset$lga_size) * log2(dataset$AUSTRALIA / dataset$lga_size) )) +
    (-1 * ( (dataset$ENGLAND   / dataset$lga_size) * log2(dataset$ENGLAND   / dataset$lga_size) )) +
    (-1 * ( (dataset$CHINA     / dataset$lga_size) * log2(dataset$CHINA     / dataset$lga_size) )) +
    (-1 * ( (dataset$INDIA     / dataset$lga_size) * log2(dataset$INDIA     / dataset$lga_size) )) +
    (-1 * ( (dataset$OTHERS    / dataset$lga_size) * log2(dataset$OTHERS    / dataset$lga_size) ))
  )
  #Weight each row's entropy by its share of the total population
  dataset$lg_mwa <- (dataset$lga_size/total_size) * dataset$lga_e
  #Sum the weighted values to get the overall expected entropy
  t_ent <- sum(dataset$lg_mwa)
  return(t_ent)
}
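#Worked check of the per-row entropy formula (illustrative, made-up numbers):
#a uniform split across the 5 groups maximises the entropy at log2(5) ~ 2.32 bits
p_uniform <- rep(1/5, 5)
sum(-p_uniform * log2(p_uniform)) #equals log2(5)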
#Generate the plot data
plot_points <- numeric(100)
for(i in 1:100)
{
  #Add 5 to increase the starting bias, as anything < 5 will cause it to generate 0
  #values
  plot_points[i] <- entropy_x_y_d(data, i + 5)
}
#Plot the data
plot(seq(1,100), plot_points, ylab="Entropy (in bits)", xlab="Alpha size")
#Initial starting conditions: a zero-filled parameter array and a test beta
ex_b = c(0.43, -1.7, 2.8)
initial_theta <- rep(0, ncol(XTest))
#The z function: computes the linear predictor that is fed to the tanh activation
#(beta[1] acts as the intercept; the i_num argument is unused)
q2.util.z <- function(beta, i_num, x_row) {
  zv <- beta[1] + x_row[2]*beta[2] + x_row[3]*beta[3]
  return(zv)
}
#Various utility functions that cover steps of the log likelihood
q2.util.p <- function(i_z) return(exp(i_z) + exp(-1*i_z)) #Normaliser of the tanh model: exp(z) + exp(-z), i.e. 2*cosh(z)
q2.util.l <- function(p_z) return(-1*log(p_z)) #Returns the negative log of the value
q2.util.yb <- function(y, p_z) return(y*p_z) #Multiplies y by its argument (called with z below)
q2.util.norm <- function(v, n) return(v/n) #Divides all the values by n
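#Side note (illustrative check, not in the original gist): the derivative of
#log(exp(z) + exp(-z)) is tanh(z), which is where the tanh in the gradient
#function further down comes from; a quick numerical confirmation at an
#arbitrary point z0:
z0 <- 0.7
h  <- 1e-6
(log(q2.util.p(z0 + h)) - log(q2.util.p(z0 - h))) / (2*h) #central difference
tanh(z0)                                                  #should match closely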
#This function is the log likelihood function as provided
#The algorithm is basic: it adds up all the
#values according to the supplied formula
q2.t_log <- function(init_b=ex_b, X=XTest, Y=YTest)
{
  n <- length(Y)
  sum_t <- 0
  for(i in 1:n)
  {
    x_r <- X[i,]
    zv <- q2.util.z(init_b, i, x_r)
    pv <- q2.util.p(zv)
    ll <- (q2.util.l(pv) + q2.util.yb(Y[i], zv))
    sum_t <- sum_t + ll
  }
  sum_t <- sum_t * -1
  return(sum_t)
}
#This is the gradient descent function
#It is based on the derivative of the provided log likelihood function,
#of the form yi*xi - bi*tanh(z)
#where z = b0x0 + b1x1 + b2x2
#While a specific derived b0 function was calculated separately,
#I found that the general form worked as well
q2.t_gradient <- function(init_b=ex_b, X=XTest, Y=YTest)
{
  n <- length(Y)
  alpha <- 0.00526 #Value experimentally determined for stability
  gradient <- init_b #copy the array
  for(i in 1:n)
  {
    x_r <- X[i,]
    zv <- q2.util.z(gradient, i, x_r)
    #Work out the gradient using the formula
    b0 <- ((Y[i]*x_r[1]) - (gradient[1]*tanh(zv)))
    b1 <- ((Y[i]*x_r[2]) - (gradient[2]*tanh(zv)))
    b2 <- ((Y[i]*x_r[3]) - (gradient[3]*tanh(zv)))
    #Once the gradients are calculated, put them back into the array
    #following the update rule x = x - alpha*gradient(x)
    gradient[1] = gradient[1] - (alpha * b0)
    gradient[2] = gradient[2] - (alpha * b1)
    gradient[3] = gradient[3] - (alpha * b2)
  }
  return(gradient)
}
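#Optional cross-check (illustrative, not in the original gist; relies on XTest
#and YTest being loaded, as elsewhere in this script): q2.t_gradient returns
#updated parameters after one pass rather than a raw gradient, so as a rough
#sanity check compare the direction of its update against a central
#finite-difference gradient of q2.t_log around ex_b; num_grad is just a
#throwaway name for this check
num_grad <- sapply(1:3, function(j) {
  h  <- 1e-5
  bp <- ex_b; bm <- ex_b
  bp[j] <- bp[j] + h
  bm[j] <- bm[j] - h
  (q2.t_log(bp) - q2.t_log(bm)) / (2*h)
})
num_grad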
q2.var.normal <- 0
q2.var.normal.sd <- 10
#Ok, the Gaussian theory for P(BETA|X) = (P(X|BETA)P(BETA))/P(X)
#The log taken is: log P(BETA) + log P(X|BETA) - log P(X)
#Normally P(X) is ignored, following: https://wiseodd.github.io/techblog/2017/01/01/mle-vs-map/
#Giving us log P(BETA) + log P(X|BETA), where log P(X|BETA) is our MLE objective to begin with!
#Where P(BETA) = Normal(BETA)
#So we take our MLE and ADD the log P(BETA) component to the solver
#This function calculates the log of the normal distribution using the PDF
#https://www.ime.unicamp.br/~cnaber/optim%202.pdf
#Following the document, the beta prior is calculated as the log of the normal PDF,
#not the log-normal distribution; it follows (1/(2*sd))*b^2
#This gives us the log of the normal in the P(BETA) component
q2.util.p_beta <- function(beta) {
  lp1 <- beta[1]^2
  lp2 <- beta[2]^2
  lp3 <- beta[3]^2
  lp <- (lp1 + lp2 + lp3)*(1/(q2.var.normal.sd*2)) #Finally compute the log prior
  return(lp) #the betas' normal-curve prior
}
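#Cross-check (illustrative; assumes q2.var.normal.sd is treated as the prior
#variance sigma^2 and that the additive normalising constant is dropped): the
#penalty should then match the negative log of the normal PDF up to that constant
s2 <- q2.var.normal.sd
-sum(dnorm(ex_b, mean = 0, sd = sqrt(s2), log = TRUE)) - 1.5*log(2*pi*s2) #~ q2.util.p_beta(ex_b)
q2.util.p_beta(ex_b)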
q2.t_blog <- function(init_b=ex_b, X=XTest, Y=YTest)
{
  n <- length(Y)
  sum_t <- 0
  lp <- q2.util.p_beta(init_b)
  for(i in 1:n)
  {
    x_r <- X[i,]
    zv <- q2.util.z(init_b, i, x_r)
    pv <- q2.util.p(zv)
    ll <- (q2.util.l(pv) + q2.util.yb(Y[i], zv))
    sum_t <- sum_t + ll
  }
  #Negate the likelihood term and add the prior penalty (the MAP objective)
  sum_t <- (sum_t * -1) + lp
  return(sum_t)
}
print("Sanity check the likelihood function")
q2.t_log()
q2.t_blog()
print("Sanity Check the gradient - see if its close to the given beta when it returns")
q2.t_gradient()
print("Sanity Check the gradient - see if its behaves with a 0 set")
q2.t_gradient(init_b=initial_theta)
print("Optim - no Gradient used - No MAP")
optim(par=initial_theta,X=X,Y=Y,fn=q2.t_log)$par
print("BFGS - Gradient used - No MAP")
optim(par=initial_theta,X=X,Y=Y,fn=q2.t_log,gr=q2.t_gradient, method="BFGS")$par
print("Optim - no Gradient used - using MAP")
optim(par=initial_theta,X=X,Y=Y,fn=q2.t_blog)$par
print("BFGS - Gradient used - using MAP")
optim(par=initial_theta,X=X,Y=Y,fn=q2.t_blog,gr=q2.t_gradient, method="BFGS")$par