@tanzhenyu · Last active August 23, 2019
ppo TF2
import tensorflow as tf
import gym
import numpy as np
import scipy.signal


def mlp(ob_space, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
    model = tf.keras.Sequential()
    for h in hidden_sizes[:-1]:
        model.add(tf.keras.layers.Dense(units=h, activation=activation))
    model.add(tf.keras.layers.Dense(units=hidden_sizes[-1], activation=output_activation))
    model.build(input_shape=(None,) + ob_space.shape)
    return model
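
# For example (illustrative, not part of the original gist):
#   mlp(ob_space, hidden_sizes=(64, 64, 2)) builds a feed-forward network mapping a
#   batch of observations of shape (None,) + ob_space.shape to outputs of shape (None, 2).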

class MlpCategoricalActorCritic(tf.keras.Model):
    def __init__(self, ob_space, ac_space, hidden_sizes=(64, 64), activation=tf.keras.activations.tanh, output_activation=None):
        super(MlpCategoricalActorCritic, self).__init__()
        self.act_dim = ac_space.n
        with tf.name_scope('pi'):
            self.actor_mlp = mlp(ob_space=ob_space, hidden_sizes=list(hidden_sizes)+[self.act_dim], activation=activation)
        with tf.name_scope('v'):
            self.critic_mlp = mlp(ob_space=ob_space, hidden_sizes=list(hidden_sizes)+[1], activation=activation)

    @tf.function
    def get_pi_logpi_vf(self, observations):
        # Sample actions from the categorical policy; return them with their
        # log-probabilities and the critic's value estimates.
        logits = self.actor_mlp(observations)
        logp_all = tf.nn.log_softmax(logits)
        pi = tf.squeeze(tf.random.categorical(logits, num_samples=1, seed=0), axis=1)
        logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=self.act_dim) * logp_all, axis=1)
        vf = self.critic_mlp(observations)
        return pi, logp_pi, vf

    @tf.function
    def get_logp(self, observations, actions):
        # Log-probabilities of the given actions under the current policy.
        logits = self.actor_mlp(observations)
        logp_all = tf.nn.log_softmax(logits)
        return tf.reduce_sum(tf.one_hot(actions, depth=self.act_dim) * logp_all, axis=1)

    @tf.function
    def get_v(self, observations):
        # Value estimates, squeezed to shape (batch,).
        return tf.squeeze(self.critic_mlp(observations), axis=1)
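
# Usage sketch (illustrative, not part of the original gist), assuming an env with a
# Box observation space and a Discrete action space (e.g. CartPole):
#   ac = MlpCategoricalActorCritic(env.observation_space, env.action_space)
#   pi, logp_pi, vf = ac.get_pi_logpi_vf(obs_batch)  # shapes: (B,), (B,), (B, 1)
#   v = ac.get_v(obs_batch)                          # shape: (B,)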

def discount_cumsum(x, discount):
    # Discounted cumulative sum over the time axis:
    # y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
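
# Worked example (illustrative, not part of the original gist):
#   discount_cumsum([1., 1., 1.], 0.9) -> [1 + 0.9 + 0.81, 1 + 0.9, 1] = [2.71, 1.9, 1.0]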

def combined_shape(length, shape=None):
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)

class PPOBuffer:
    def __init__(self, ob_space, ac_space, size, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros(combined_shape(size, ob_space.shape), dtype=ob_space.dtype)
        self.act_buf = np.zeros(combined_shape(size, ac_space.shape), dtype=ac_space.dtype)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        assert self.ptr < self.max_size  # buffer has to have room so you can store
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
        # the next two lines implement the GAE-Lambda advantage calculation:
        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), A_t = sum_l (gamma * lam)**l * delta_{t+l}
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        assert self.ptr == self.max_size  # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # advantage normalization trick (zero mean, unit std)
        adv_mean = np.mean(self.adv_buf)
        adv_std = np.std(self.adv_buf)
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std
        return [self.obs_buf, self.act_buf, self.adv_buf,
                self.ret_buf, self.logp_buf]
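
# Illustrative summary (not in the original gist) of how PPOBuffer is driven by the
# training loop below: store() once per environment step, finish_path() whenever an
# episode ends or the epoch cuts a trajectory short (bootstrapping with the last value
# estimate), and get() once per epoch to fetch observations, actions, normalized
# advantages, rewards-to-go, and old log-probs for the update.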

def ppo(seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01):
    tf.random.set_seed(seed)
    np.random.seed(seed)
    env = gym.make('CartPole-v1')
    ob_space = env.observation_space
    ac_space = env.action_space
    obs_dim = ob_space.shape
    act_dim = ac_space.shape
    model = MlpCategoricalActorCritic(ob_space, ac_space)

    # Optimizers
    opt_pi = tf.keras.optimizers.Adam(learning_rate=pi_lr)
    opt_v = tf.keras.optimizers.Adam(learning_rate=vf_lr)

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch)
    buf = PPOBuffer(ob_space, ac_space, local_steps_per_epoch, gamma, lam)

    # Trainable weights for actor and critic
    actor_weights = model.actor_mlp.trainable_weights
    critic_weights = model.critic_mlp.trainable_weights

    @tf.function
    def update(obs, acs, advs, rets, logp_olds):
        stopIter = tf.constant(train_pi_iters)
        pi_loss = 0.
        for i in tf.range(train_pi_iters):
            with tf.GradientTape() as tape:
                logp = model.get_logp(obs, acs)
                ratio = tf.exp(logp - logp_olds)
                min_adv = tf.where(advs > 0, (1+clip_ratio)*advs, (1-clip_ratio)*advs)
                pi_loss = -tf.reduce_mean(tf.minimum(ratio * advs, min_adv))
            grads = tape.gradient(pi_loss, actor_weights)
            opt_pi.apply_gradients(zip(grads, actor_weights))
            kl = tf.reduce_mean(logp_olds - logp)  # approximate KL for early stopping
            if kl > 1.5 * target_kl:
                stopIter = i
                break

        v_loss = 0.
        for i in tf.range(train_v_iters):
            with tf.GradientTape() as tape:
                v = model.get_v(obs)
                v_loss = tf.reduce_mean((rets - v)**2)
            grads = tape.gradient(v_loss, critic_weights)
            opt_v.apply_gradients(zip(grads, critic_weights))
        return pi_loss, v_loss, stopIter
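
    # Added note (not in the original gist): the surrogate above is the standard
    # PPO-Clip objective,
    #     L = -E[ min(ratio * A, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * A) ],
    # with ratio = pi(a|s) / pi_old(a|s); the tf.where form is equivalent to clipping
    # because only the sign of the advantage determines which side of the clip can bind.
    # Policy updates stop early once the approximate KL, E[logp_old - logp],
    # exceeds 1.5 * target_kl.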

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        Ep_Ret = []
        for t in range(local_steps_per_epoch):
            expand_o = tf.constant(o.reshape(1, -1))
            a, logp_t, v_t = model.get_pi_logpi_vf(expand_o)
            a = a.numpy()[0]
            logp_t = logp_t.numpy()[0]
            v_t = v_t.numpy()[0][0]
            buf.store(o, a, r, v_t, logp_t)
            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # bootstrap the value target if the trajectory did not reach a terminal state
                last_val = r if d else model.get_v(tf.constant(o.reshape(1, -1))).numpy()[0]
                buf.finish_path(last_val)
                if terminal:
                    Ep_Ret.append(ep_ret)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        obs, acs, advs, rets, logp_olds = buf.get()
        pi_loss, v_loss, stopIter = update(obs, acs, advs, rets, logp_olds)
        print('---------------------------------')
        print('epoch {}'.format(epoch))
        print('pi loss {}'.format(pi_loss.numpy()))
        print('vf loss {}'.format(v_loss.numpy()))
        print('stop iter {}'.format(stopIter.numpy()))
        print('Ep Ret {}'.format(np.mean(Ep_Ret)))
    return model, env

if __name__ == '__main__':
    model, env = ppo()
    obs = env.reset()
    reward = 0
    while True:
        action, _, _ = model.get_pi_logpi_vf(obs.reshape(1, -1))
        obs, r, d, _ = env.step(action.numpy()[0])
        reward += r
        env.render()
        if d:
            print('episode reward {}'.format(reward))
            reward = 0
            obs = env.reset()