import numpy as np
import argparse
import os
import gzip
import pickle
from urllib.request import urlretrieve
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.gradient import zero_grad
import lasagne.layers as L
import lasagne.nonlinearities as nl
from lasagne.updates import adam
from lasagne.random import get_rng
def run(optimizer, epochs=100, batch_size=250, seed=None):
    if seed is not None:
        np.random.seed(seed)
    dataset = Mnist()
    dataset.load()
    # Build the training function:
    f = build_train_func(optimizer, [784, 512, 512, 64])
    num_batches = int(np.ceil(50000.0 / float(batch_size)))
    results = np.zeros((num_batches * epochs, 1))
    i = 0
    for e in range(epochs):
        for x, _ in dataset.iter("train", batch_size):
            # Sample a binarised version of the images
            x = (np.random.rand(*x.shape) < x).astype(theano.config.floatX)
            results[i, 0] = f(x)
            print("[{}] Loss:".format(i), results[i, 0])
            i += 1
    np.savetxt("results.csv", results, delimiter=",")
    print("Done")
def build_train_func(optimizer, architecture):
    # Input
    l_in = L.InputLayer((None, architecture[0]), name="x")
    # x
    layer = l_in
    # Encoder x -> z
    for i, a in enumerate(architecture[1:-1]):
        name = "encode_{}".format(i + 1)
        layer = L.DenseLayer(layer, a, nonlinearity=nl.tanh, name=name)
    name = "encode_{}".format(len(architecture) - 1)
    layer = L.DenseLayer(layer, 2 * architecture[-1],
                         nonlinearity=nl.identity,
                         name=name)
    # Q(z|x)
    l_q = GaussianParameters(layer, name="q")
    # Samples from Q
    layer = GaussianSampler(l_q, name="q_sample")
    # Decoder z -> x
    for i, a in reversed(list(enumerate(architecture[1:-1]))):
        name = "decode_{}".format(i + 1)
        layer = L.DenseLayer(layer, a, nonlinearity=nl.tanh, name=name)
    # P(x|z)
    layer = L.DenseLayer(layer, architecture[0],
                         nonlinearity=nl.sigmoid,
                         name="p")
    q_params, prediction = L.get_output((l_q, layer))
    # Binary cross-entropy is the negative Bernoulli log-likelihood -log p(x|z)
    neg_log_p_x = T.nnet.binary_crossentropy(prediction, l_in.input_var)
    neg_log_p_x = T.sum(neg_log_p_x, axis=1)
    kl = gaussian_kl(q_params)
    neg_elbo = T.mean(neg_log_p_x + kl)
    params = L.get_all_params(layer, trainable=True)
    if optimizer == "yellow_fin":
        # yellow_fin is defined in the optimizer file later in this gist
        updates = yellow_fin(neg_elbo, params)
    elif optimizer == "adam":
        updates = adam(neg_elbo, params, 0.001)
    else:
        raise NotImplementedError(optimizer)
    return theano.function([l_in.input_var], neg_elbo, updates=updates)
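# A note on the objective built above (the standard VAE identity, stated here
# for reference): the quantity minimised is the negative ELBO,
#   -ELBO(x) = E_{q(z|x)}[-log p(x|z)] + KL(q(z|x) || p(z)),
# with the expectation estimated from a single reparameterised sample of z.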
class GaussianParameters(L.Layer):
    def __init__(self, incoming, nonlinearity=nl.softplus,
                 eps=1e-6, **kwargs):
        super(GaussianParameters, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        self.eps = eps

    @staticmethod
    def get_raw(distribution):
        # The first half of the units are mu, the second half pre-sigma
        d = T.int_div(distribution.shape[1], 2)
        mu = distribution[:, :d]
        pre_sigma = distribution[:, d:]
        return mu, pre_sigma

    def get_output_for(self, inputs, **kwargs):
        mu, pre_sigma = GaussianParameters.get_raw(inputs)
        sigma = self.nonlinearity(pre_sigma) + self.eps
        return T.concatenate((mu, sigma), axis=1)
class GaussianSampler(L.Layer):
    def __init__(self, incoming,
                 num_samples=1,
                 **kwargs):
        super(GaussianSampler, self).__init__(incoming, **kwargs)
        self.num_samples = num_samples

    def get_output_shape_for(self, input_shape):
        shape = input_shape
        assert shape[1] % 2 == 0
        out_shape = (shape[0] * self.num_samples if shape[0] is not None else None,
                     shape[1] // 2)
        return out_shape

    def get_output_for(self, inputs, **kwargs):
        mu, sigma = GaussianParameters.get_raw(inputs)
        n, d = mu.shape
        epsilon = normal((n, self.num_samples, d))
        samples = mu.dimshuffle(0, 'x', 1) + sigma.dimshuffle(0, 'x', 1) * epsilon
        samples = T.reshape(samples, (n * self.num_samples, d))
        return samples
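# The sampler above implements the reparameterisation trick:
# z = mu + sigma * eps with eps ~ N(0, I), so gradients flow through
# mu and sigma while the noise eps is held constant (see the zero_grad
# call in normal() below).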
def gaussian_kl(q_params):
    q_mu, q_sigma = GaussianParameters.get_raw(q_params)
    # Trace
    kl = T.sqr(q_sigma)
    # Means difference
    kl += T.sqr(q_mu - 0)
    # Both times inverse of sigma p
    kl /= T.sqr(1)
    # Minus 1
    kl -= T.constant(1)
    # Log determinant of p_sigma
    kl += 2 * T.log(1)
    # Log determinant of q_sigma
    kl -= 2 * T.log(q_sigma)
    # Flatten
    kl = T.flatten(kl, outdim=2)
    # Calculate per data point
    kl = 0.5 * T.sum(kl, axis=1)
    return kl
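# The step-by-step arithmetic above is the closed form
#   KL(N(mu, sigma^2) || N(0, 1)) = 0.5 * sum(sigma^2 + mu^2 - 1 - 2*log(sigma)),
# with the standard-normal prior's unit terms (division by 1^2, log 1) kept
# explicit so the correspondence to a general Gaussian prior stays visible.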
_srng = None

def normal(shape):
    # Lazily construct a single module-level random stream
    global _srng
    if _srng is None:
        _srng = RandomStreams(get_rng().randint(1, 2147462579))
    samples = _srng.normal(shape, dtype=theano.config.floatX)
    # Treat the noise as a constant w.r.t. differentiation
    samples = zero_grad(samples)
    return samples
class Mnist(object):
    """
    The Mixed National Institute of Standards and Technology dataset.
    """
    download_url = "http://deeplearning.net/data/mnist/mnist.pkl.gz"

    def __init__(self, flatten=True, path=None):
        self.flatten = flatten
        self.path = path if path is not None else "./mnist"
        self.data = dict()

    def iter(self, partition, batch_size, shuffle=True):
        n = self.data[partition][1].shape[0]
        index = np.arange(n)
        if shuffle:
            np.random.shuffle(index)
        for i in range(0, n, batch_size):
            yield tuple(x[index[i: i + batch_size]] for x in self.data[partition])

    def load(self, verbose=True):
        c = 0
        while not self.verify_download():
            self.download()
            c += 1
            if c >= 5:
                raise ValueError("Could not verify download after 5 attempts.")
        file_name = os.path.join(self.path, "mnist.pkl.gz")
        with gzip.open(file_name, 'rb') as f:
            data = pickle.load(f, encoding='latin1')
        if self.flatten:
            self.data["train"] = data[0]
            self.data["valid"] = data[1]
            self.data["test"] = data[2]
        else:
            self.data["train"] = [data[0][0].reshape((-1, 28, 28)), data[0][1]]
            self.data["valid"] = [data[1][0].reshape((-1, 28, 28)), data[1][1]]
            self.data["test"] = [data[2][0].reshape((-1, 28, 28)), data[2][1]]
        if verbose:
            print("Train - Images:", self.data["train"][0].shape,
                  "Targets:", self.data["train"][1].shape)
            print("Valid - Images:", self.data["valid"][0].shape,
                  "Targets:", self.data["valid"][1].shape)
            print("Test - Images:", self.data["test"][0].shape,
                  "Targets:", self.data["test"][1].shape)

    def download(self):
        """
        Downloads the dataset from the website.
        """
        if not os.path.exists(self.path):
            os.makedirs(self.path)
        file_path = os.path.join(self.path, "mnist.pkl.gz")
        # Plain stdlib download (the original called an undefined general_utils helper)
        urlretrieve(Mnist.download_url, file_path)

    def verify_download(self):
        """
        Verifies that the dataset has been downloaded and is present.
        """
        return os.path.exists(os.path.join(self.path, "mnist.pkl.gz"))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("optimizer", type=str)
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=250)
    run(**vars(parser.parse_args()))
import numpy as np
import theano
import theano.tensor as T
from theano.printing import Print

fx = theano.config.floatX

def print_values(var, msg):
    """
    Makes an op which prints the values of the variable with the message.
    """
    return Print(msg)(var)
def solve(c, d, h_min, debug=False):
    # We minimise the equation x^2 * D^2 + (1 - x)^4 * C / h_min^2
    # where x = sqrt(mu).
    # Minimising this reduces to solving
    #   y^3 + p * y + p = 0
    # with y = x - 1 and p = (D^2 * h_min^2) / (2 * C)
    p = (T.sqr(d) * T.sqr(h_min)) / (T.constant(2) * c)
    w3 = p * (T.sqrt(0.25 + p / 27.0) - 0.5)
    w = T.power(w3, 1.0 / 3.0)
    y = w - p / (3 * w)
    sqrt_mu = y + 1
    if debug:
        value = print_values(y * y * y + p * y + p, "derivative_value")
        sqrt_mu += 1e-20 * value
    return sqrt_mu
def solve_np(c, d, h_min):
    p = (np.square(d) * np.square(h_min)) / np.asarray(2 * c, dtype=fx)
    # Coefficients (highest degree first) of the same cubic written in x = y + 1
    coeffs = np.asarray([-1.0, 3.0, -3.0 - p, 1.0], dtype=fx)
    return np.roots(coeffs)

if __name__ == "__main__":
    N = 20
    D = 5
    c = (np.random.rand(N) * D).astype(fx)
    d = (np.random.rand(N) * D).astype(fx)
    h_min = (np.random.rand(N) * D).astype(fx)
    for i in range(N):
        x1 = solve(c[i], d[i], h_min[i]).eval().item()
        x2 = solve_np(c[i], d[i], h_min[i])
        # Keep only the real roots (real part equal to the modulus)
        ind = np.real(x2) == np.sqrt(x2 * np.conj(x2))
        x2 = np.real(x2[ind])[0]
        print("{:.10f} - {:.10f} = {:.10f}".format(x1, x2, x1 - x2))
import numpy as np
import theano
import theano.tensor as T
from theano.printing import Print
from collections import OrderedDict
def yellow_fin(loss, params, beta=0.999,
               learning_rate_init=1.0, momentum_init=0.0,
               learning_rate_factor=1.0, t=None,
               window_width=20, debug=False):
    """
    The YellowFin algorithm.

    :param loss: Theano expression for the loss
    :param params: list of shared variables
    :param beta: the moving-average smoothing coefficient
    :param learning_rate_init: initial learning rate
    :param momentum_init: initial momentum
    :param learning_rate_factor: an additional multiplicative factor applied to the base learning rate
    :param t: optionally pass your own time variable
    :param window_width: width of the window for calculating h_max and h_min
    :param debug: flag for debugging
    :return: an OrderedDict of updates
    """
    grads = T.grad(loss, params)
    updates = OrderedDict()
    alpha = theano.shared(value_floatX(learning_rate_init), name="learning_rate")
    mu = theano.shared(value_floatX(momentum_init), name="momentum")
    if t is None:
        t = theano.shared(np.asarray(0).astype(np.int32), name="t")
    updates[t] = t + 1
    # Fetch variables from the routines
    h_max, h_min = curvature_range(grads, beta, t, updates, window_width)
    c = gradient_variance(grads, params, beta, updates)
    d = distance_to_optim(grads, beta, updates)
    if debug:
        h_max = print_values(h_max, "h_max")
        h_min = print_values(h_min, "h_min")
        c = print_values(c, "c")
        d = print_values(d, "d")
    # Get the solution to the minimisation problem for mu
    sqrt_mu1 = solve(c, d, h_min)
    if debug:
        sqrt_mu1 = print_values(sqrt_mu1, "sqrt_mu1")
    sqrt_mu2 = (T.sqrt(h_max) - T.sqrt(h_min)) / (T.sqrt(h_max) + T.sqrt(h_min))
    sqrt_mu = T.maximum(sqrt_mu1, sqrt_mu2)
    # Given the solution, the final mu_t and alpha_t
    alpha_t = T.sqr(1 - sqrt_mu) / h_min
    mu_t = T.sqr(sqrt_mu)
    if debug:
        mu_t = print_values(mu_t, "mu_t")
        alpha_t = print_values(alpha_t, "alpha_t")
    # Update moving averages
    updates[mu] = ema(beta, mu, mu_t)
    updates[alpha] = ema(beta, alpha, alpha_t)
    if debug:
        updates[mu] = print_values(updates[mu], "mu")
        updates[alpha] = print_values(updates[alpha], "alpha")
    # Apply momentum
    momentum(grads, params, learning_rate_factor * updates[alpha], updates[mu], updates)
    return updates
def curvature_range(grads, beta, t, updates, window_width=20, debug=False):
    """
    Routine for calculating the h_max and h_min curvature range.
    """
    # Update the window of squared gradient norms
    window = theano.shared(np.zeros(window_width, dtype=theano.config.floatX), name="window")
    t_mod = T.mod(t, window_width)
    updates[window] = T.set_subtensor(window[t_mod], sum(T.sum(T.sqr(g)) for g in grads))
    if debug:
        updates[window] = print_values(updates[window], "window")
    # Get the h_max_t and h_min_t
    t = T.minimum(t + 1, window_width)
    h_max_t = T.max(updates[window][:t])
    h_min_t = T.min(updates[window][:t])
    # Update the moving averages
    h_max = theano.shared(value_floatX(0.0), name="h_max")
    h_min = theano.shared(value_floatX(0.0), name="h_min")
    updates[h_max] = ema(beta, h_max, h_max_t)
    updates[h_min] = ema(beta, h_min, h_min_t)
    return updates[h_max], updates[h_min]
def gradient_variance(grads, params, beta, updates):
    """
    Routine for calculating the variance of the gradients.
    """
    # Total variance
    variance = 0
    for param, grad in zip(params, grads):
        # Make shared variables for the first and second moments
        mom1 = shared_mirror(param)
        mom2 = shared_mirror(param)
        # Update moving averages
        updates[mom1] = ema(beta, mom1, grad)
        updates[mom2] = ema(beta, mom2, T.sqr(grad))
        # Update the total variance
        variance += T.sum(T.abs_(updates[mom2] - T.sqr(updates[mom1])))
    return variance
def distance_to_optim(grads, beta, updates):
    """
    Routine for calculating the distance to the optimum.
    """
    # Had an issue with initialising to 0.0, so switched to 1.0
    g = theano.shared(value_floatX(1.0), name="g")
    h = theano.shared(value_floatX(1.0), name="h")
    d = theano.shared(value_floatX(1.0), name="d")
    # Squared L2 norm of the full gradient
    l2_norm = sum(T.sum(T.sqr(grad)) for grad in grads)
    updates[g] = ema(beta, g, T.sqrt(l2_norm))
    updates[h] = ema(beta, h, l2_norm)
    updates[d] = ema(beta, d, updates[g] / updates[h])
    return updates[d]
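# As written, g tracks a moving average of the gradient norm and h a moving
# average of its square; d then smooths the ratio g / h, which serves as the
# proxy for the distance to the optimum.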
def solve(c, d, h_min, debug=False):
    # We minimise the equation x^2 * D^2 + (1 - x)^4 * C / h_min^2
    # where x = sqrt(mu).
    # Minimising this reduces to solving
    #   y^3 + p * y + p = 0
    # with y = x - 1 and p = (D^2 * h_min^2) / (2 * C)
    p = (T.sqr(d) * T.sqr(h_min)) / (2 * c)
    w3 = p * (T.sqrt(0.25 + p / 27.0) - 0.5)
    w = T.power(w3, 1.0 / 3.0)
    y = w - p / (3 * w)
    sqrt_mu = y + 1
    if debug:
        value = print_values(y * y * y + p * y + p, "derivative_value")
        sqrt_mu += 1e-20 * value
    return sqrt_mu
def momentum(grads, params, learning_rate, momentum, updates=None):
    """
    Standard momentum - copied from the Lasagne library.
    """
    updates = OrderedDict() if updates is None else updates
    velocities = [shared_mirror(p) for p in params]
    for param, grad, v in zip(params, grads, velocities):
        updates[v] = v * momentum - learning_rate * grad
        updates[param] = param + updates[v]
    return updates
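# The update applied above is classical momentum,
#   v_{t+1} = mu * v_t - alpha * grad,   p_{t+1} = p_t + v_{t+1},
# except that mu and alpha are themselves symbolic expressions carrying the
# YellowFin-tuned values for the current step.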
def print_values(var, msg):
    """
    Makes an op which prints the values of the variable with the message.
    """
    return Print(msg)(var)

def ema(alpha, s_t, x_t):
    """
    Exponential moving average.
    """
    return alpha * s_t + (1 - alpha) * x_t

def value_floatX(x):
    """
    Converts the value to a numpy array of type theano.config.floatX.
    """
    return np.asarray(x).astype(theano.config.floatX)

def shared_mirror(shared):
    """
    Creates a shared variable with the same specs as the input.
    """
    value = shared.get_value(borrow=True)
    return theano.shared(np.zeros(value.shape, dtype=value.dtype),
                         broadcastable=shared.broadcastable)
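For reference, a minimal sketch of plugging the optimizer into a toy problem (the quadratic loss and all names below are illustrative, not from the gist; only yellow_fin and value_floatX come from the file above):

# Minimise ||w - 3||^2 with YellowFin; w should approach 3.
w = theano.shared(value_floatX(np.ones(5)), name="w")
loss = T.sum(T.sqr(w - 3.0))
updates = yellow_fin(loss, [w])
step = theano.function([], loss, updates=updates)
for _ in range(200):
    step()
print(w.get_value())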
@JianGoForIt
Thanks a lot for stopping by and for the useful comments.

> The smoothing coefficient might be sub-optimal, we use 0.999 instead.

I've updated the gist to reflect that, and made learning_rate_init default to 1.0.

> Not really sure whether the cubic solution is correct or not. Could you please verify the solution is in [0, 1]? In the TF and PyTorch implementations, we use the numpy root function.

I've added to the gist a short Python script which verifies that the Theano cubic solver is indeed accurate (a difference of 1e-5 to 1e-7 from np.roots). The solver relies on Vieta's substitution in the cubic equation and on the fact that the equation has only a single real root, since p > 0.

> Another quick question, what is the phenomenon you observe on this implementation so that it is not better than Adam? Is there an explosion? If yes, you may want to follow up the thread here.

In this regard, I don't get any NaNs or anything exploding; the learning curves are simply worse than those from Adam. For more elaborate results, I suggest checking out the discussion here, where I ran the 56-layer ResNet from a standard Lasagne Recipe; the result was that YellowFin seems to underperform both Momentum and Adam.
Hi @botev,
Here is the author of YellowFin and of the PyTorch version.
Thanks for the implementation. I read your post on Reddit, and I have noticed a few things that need to be checked (there might be more points than I noticed).

The smoothing coefficient might be sub-optimal, we use 0.999 instead.

Not really sure whether the cubic solution is correct or not. Could you please verify the solution is in [0, 1]? In the TF and PyTorch implementations, we use the numpy root function.

Just a quick suggestion: maybe you can construct some simple example for both the PyTorch version and your implementation, so that you can compare intermediate statistics, etc.

Another quick question: what is the phenomenon you observe on this implementation so that it is not better than Adam? Is there an explosion? If yes, you may want to follow up the thread here.

Hope it helps, and please keep us updated about the results.