@tanzhenyu · Last active August 23, 2019
ppo TF2
import tensorflow as tf
import gym
import numpy as np
import scipy.signal


def mlp(ob_space, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
    model = tf.keras.Sequential()
    for h in hidden_sizes[:-1]:
        model.add(tf.keras.layers.Dense(units=h, activation=activation))
    model.add(tf.keras.layers.Dense(units=hidden_sizes[-1], activation=output_activation))
    model.build(input_shape=(None,) + ob_space.shape)
    return model
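
# For example (illustrative, not part of the original gist):
#   mlp(ob_space, hidden_sizes=(64, 64, 2)) builds a feed-forward network mapping a
#   batch of observations of shape (None,) + ob_space.shape to outputs of shape (None, 2).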

class MlpCategoricalActorCritic(tf.keras.Model):
    def __init__(self, ob_space, ac_space, hidden_sizes=(64, 64), activation=tf.keras.activations.tanh, output_activation=None):
        super(MlpCategoricalActorCritic, self).__init__()
        self.act_dim = ac_space.n
        with tf.name_scope('pi'):
            self.actor_mlp = mlp(ob_space=ob_space, hidden_sizes=list(hidden_sizes)+[self.act_dim], activation=activation)
        with tf.name_scope('v'):
            self.critic_mlp = mlp(ob_space=ob_space, hidden_sizes=list(hidden_sizes)+[1], activation=activation)

    @tf.function
    def get_pi_logpi_vf(self, observations):
        # Sample actions from the categorical policy; return them with their
        # log-probabilities and the critic's value estimates.
        logits = self.actor_mlp(observations)
        logp_all = tf.nn.log_softmax(logits)
        pi = tf.squeeze(tf.random.categorical(logits, num_samples=1, seed=0), axis=1)
        logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=self.act_dim) * logp_all, axis=1)
        vf = self.critic_mlp(observations)
        return pi, logp_pi, vf

    @tf.function
    def get_logp(self, observations, actions):
        # Log-probabilities of the given actions under the current policy.
        logits = self.actor_mlp(observations)
        logp_all = tf.nn.log_softmax(logits)
        return tf.reduce_sum(tf.one_hot(actions, depth=self.act_dim) * logp_all, axis=1)

    @tf.function
    def get_v(self, observations):
        # Value estimates, squeezed to shape (batch,).
        return tf.squeeze(self.critic_mlp(observations), axis=1)
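
# Usage sketch (illustrative, not part of the original gist), assuming an env with a
# Box observation space and a Discrete action space (e.g. CartPole):
#   ac = MlpCategoricalActorCritic(env.observation_space, env.action_space)
#   pi, logp_pi, vf = ac.get_pi_logpi_vf(obs_batch)  # shapes: (B,), (B,), (B, 1)
#   v = ac.get_v(obs_batch)                          # shape: (B,)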

def discount_cumsum(x, discount):
    # Discounted cumulative sum over the time axis:
    # y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
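
# Worked example (illustrative, not part of the original gist):
#   discount_cumsum([1., 1., 1.], 0.9) -> [1 + 0.9 + 0.81, 1 + 0.9, 1] = [2.71, 1.9, 1.0]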

def combined_shape(length, shape=None):
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)

class PPOBuffer:
    def __init__(self, ob_space, ac_space, size, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros(combined_shape(size, ob_space.shape), dtype=ob_space.dtype)
        self.act_buf = np.zeros(combined_shape(size, ac_space.shape), dtype=ac_space.dtype)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        assert self.ptr < self.max_size  # buffer has to have room so you can store
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
        # the next two lines implement the GAE-Lambda advantage calculation:
        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), A_t = sum_l (gamma * lam)**l * delta_{t+l}
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        assert self.ptr == self.max_size  # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # advantage normalization trick (zero mean, unit std)
        adv_mean = np.mean(self.adv_buf)
        adv_std = np.std(self.adv_buf)
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std
        return [self.obs_buf, self.act_buf, self.adv_buf,
                self.ret_buf, self.logp_buf]
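
# Illustrative summary (not in the original gist) of how PPOBuffer is driven by the
# training loop below: store() once per environment step, finish_path() whenever an
# episode ends or the epoch cuts a trajectory short (bootstrapping with the last value
# estimate), and get() once per epoch to fetch observations, actions, normalized
# advantages, rewards-to-go, and old log-probs for the update.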

def ppo(seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01):
    tf.random.set_seed(seed)
    np.random.seed(seed)
    env = gym.make('CartPole-v1')
    ob_space = env.observation_space
    ac_space = env.action_space
    obs_dim = ob_space.shape
    act_dim = ac_space.shape
    model = MlpCategoricalActorCritic(ob_space, ac_space)

    # Optimizers
    opt_pi = tf.keras.optimizers.Adam(learning_rate=pi_lr)
    opt_v = tf.keras.optimizers.Adam(learning_rate=vf_lr)

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch)
    buf = PPOBuffer(ob_space, ac_space, local_steps_per_epoch, gamma, lam)

    # Trainable weights for actor and critic
    actor_weights = model.actor_mlp.trainable_weights
    critic_weights = model.critic_mlp.trainable_weights

    @tf.function
    def update(obs, acs, advs, rets, logp_olds):
        stopIter = tf.constant(train_pi_iters)
        pi_loss = 0.
        for i in tf.range(train_pi_iters):
            with tf.GradientTape() as tape:
                logp = model.get_logp(obs, acs)
                ratio = tf.exp(logp - logp_olds)
                min_adv = tf.where(advs > 0, (1+clip_ratio)*advs, (1-clip_ratio)*advs)
                pi_loss = -tf.reduce_mean(tf.minimum(ratio * advs, min_adv))
            grads = tape.gradient(pi_loss, actor_weights)
            opt_pi.apply_gradients(zip(grads, actor_weights))
            kl = tf.reduce_mean(logp_olds - logp)  # approximate KL for early stopping
            if kl > 1.5 * target_kl:
                stopIter = i
                break

        v_loss = 0.
        for i in tf.range(train_v_iters):
            with tf.GradientTape() as tape:
                v = model.get_v(obs)
                v_loss = tf.reduce_mean((rets - v)**2)
            grads = tape.gradient(v_loss, critic_weights)
            opt_v.apply_gradients(zip(grads, critic_weights))
        return pi_loss, v_loss, stopIter
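
    # Added note (not in the original gist): the surrogate above is the standard
    # PPO-Clip objective,
    #     L = -E[ min(ratio * A, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * A) ],
    # with ratio = pi(a|s) / pi_old(a|s); the tf.where form is equivalent to clipping
    # because only the sign of the advantage determines which side of the clip can bind.
    # Policy updates stop early once the approximate KL, E[logp_old - logp],
    # exceeds 1.5 * target_kl.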

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        Ep_Ret = []
        for t in range(local_steps_per_epoch):
            expand_o = tf.constant(o.reshape(1, -1))
            a, logp_t, v_t = model.get_pi_logpi_vf(expand_o)
            a = a.numpy()[0]
            logp_t = logp_t.numpy()[0]
            v_t = v_t.numpy()[0][0]
            buf.store(o, a, r, v_t, logp_t)
            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # bootstrap the value target if the trajectory did not reach a terminal state
                last_val = r if d else model.get_v(tf.constant(o.reshape(1, -1))).numpy()[0]
                buf.finish_path(last_val)
                if terminal:
                    Ep_Ret.append(ep_ret)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        obs, acs, advs, rets, logp_olds = buf.get()
        pi_loss, v_loss, stopIter = update(obs, acs, advs, rets, logp_olds)
        print('---------------------------------')
        print('epoch {}'.format(epoch))
        print('pi loss {}'.format(pi_loss.numpy()))
        print('vf loss {}'.format(v_loss.numpy()))
        print('stop iter {}'.format(stopIter.numpy()))
        print('Ep Ret {}'.format(np.mean(Ep_Ret)))
    return model, env

if __name__ == '__main__':
    model, env = ppo()
    obs = env.reset()
    reward = 0
    while True:
        action, _, _ = model.get_pi_logpi_vf(obs.reshape(1, -1))
        obs, r, d, _ = env.step(action.numpy()[0])
        reward += r
        env.render()
        if d:
            print('episode reward {}'.format(reward))
            reward = 0
            obs = env.reset()