Cartpole Policy Gradient
# OpenAI Cartpole implementation.
# Using a Policy Gradient.
# By Tom Jacobs
#
# Runs on Python 3.
# Originally based on https://github.com/kvfrans/openai-cartpole
# You can submit it to the OpenAI Gym scoreboard by entering your OpenAI API key and enabling submit below.
# It will submit only if it is considered solved.
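#
# In outline: a linear softmax policy over the 4-dimensional cart-pole state chooses left or right.
# After each episode, a small value network is fit to the discounted Monte Carlo returns, and the
# policy is updated with a REINFORCE-style gradient weighted by the advantage
# (the return minus the value network's baseline prediction).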

import tensorflow as tf
import numpy as np
import random
import gym
import math
import matplotlib.pyplot as plt

# Submit it?
submit = True
api_key = ''

# Numerically stable softmax helper. (Note: unused below; the policy graph uses tf.nn.softmax directly.)
def softmax(x):
    e_x = np.exp(x - np.max(x))
    out = e_x / e_x.sum()
    return out

def policy_gradient():
    with tf.variable_scope("policy"):
        params = tf.get_variable("policy_parameters", [4, 2])  # Parameters
        state = tf.placeholder("float", [None, 4])              # World state
        actions = tf.placeholder("float", [None, 2])            # Actions - move left, or right
        advantages = tf.placeholder("float", [None, 1])         # Ooh, advantages
        linear = tf.matmul(state, params)                       # Combine
        probabilities = tf.nn.softmax(linear)                   # Probabilities
        good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions), reduction_indices=[1])
        eligibility = tf.log(good_probabilities) * advantages
        loss = -tf.reduce_sum(eligibility)
        optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)  # Learning rate 0.1, aim to minimize loss
        return probabilities, state, actions, advantages, optimizer
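
# The policy loss above is the standard REINFORCE surrogate: -sum_t log pi(a_t | s_t) * A_t,
# where pi(a | s) is the softmax over the linear layer and A_t is the advantage fed in at
# training time. Minimising it with Adam ascends the estimated policy gradient.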

def value_gradient():
    with tf.variable_scope("value"):
        state = tf.placeholder("float", [None, 4])  # World state
        newvals = tf.placeholder("float", [None, 1])
        w1 = tf.get_variable("w1", [4, 2])          # Value network is *w1+b1, ReLU, *w2+b2: layer sizes 4, 2, 1.
        b1 = tf.get_variable("b1", [2])
        h1 = tf.nn.relu(tf.matmul(state, w1) + b1)
        w2 = tf.get_variable("w2", [2, 1])
        b2 = tf.get_variable("b2", [1])
        calculated = tf.matmul(h1, w2) + b2
        diffs = calculated - newvals                # How different did we do from expected?
        loss = tf.nn.l2_loss(diffs)
        optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
        return calculated, state, newvals, optimizer, loss
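
# The value network acts as a baseline: run_episode below subtracts its prediction V(s_t) from the
# discounted Monte Carlo return G_t, so policy updates are weighted by the advantage G_t - V(s_t)
# rather than by the raw return. This is the usual variance-reduction trick for vanilla policy gradients.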

def run_episode(env, policy_grad, value_grad, sess, render=False):
    pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad
    vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_grad
    observation = env.reset()
    totalreward = 0
    states = []
    actions = []
    advantages = []
    transitions = []
    update_vals = []
    for t in range(200):
        # Render
        if render:
            env.render()

        # Calculate policy
        obs_vector = np.expand_dims(observation, axis=0)
        probs = sess.run(pl_calculated, feed_dict={pl_state: obs_vector})
        action = 0 if random.uniform(0, 1) < probs[0][0] else 1

        # Record the transition
        states.append(observation)
        actionblank = np.zeros(2)
        actionblank[action] = 1
        actions.append(actionblank)

        # Take the action in the environment
        old_observation = observation
        observation, reward, done, info = env.step(action)
        transitions.append((old_observation, action, reward))
        totalreward += reward

        # Done?
        if done:
            break

    for index, trans in enumerate(transitions):
        obs, action, reward = trans

        # Calculate discounted Monte Carlo return
        future_reward = 0
        future_transitions = len(transitions) - index
        decrease = 1
        for index2 in range(future_transitions):
            future_reward += transitions[index2 + index][2] * decrease
            decrease = decrease * 0.95
        obs_vector = np.expand_dims(obs, axis=0)
        currentval = sess.run(vl_calculated, feed_dict={vl_state: obs_vector})[0][0]

        # Advantage: how much better was this action than normal?
        advantages.append(future_reward - currentval)

        # Update the value function towards new return
        update_vals.append(future_reward)

    # Update value function
    update_vals_vector = np.expand_dims(update_vals, axis=1)
    sess.run(vl_optimizer, feed_dict={vl_state: states, vl_newvals: update_vals_vector})

    # Update policy function
    advantages_vector = np.expand_dims(advantages, axis=1)
    sess.run(pl_optimizer, feed_dict={pl_state: states, pl_advantages: advantages_vector, pl_actions: actions})

    # Done
    return totalreward

# Go
env = gym.make('CartPole-v0')
env = gym.wrappers.Monitor(env, 'cartpole', force=True)
policy_grad = policy_gradient()
value_grad = value_gradient()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# Learn
results = []
for i in range(500):
    reward = run_episode(env, policy_grad, value_grad, sess)
    results.append(reward)
    if reward < 200:
        print("Fail at {}".format(i))
        #break

# Evaluate over 100 episodes
print("Running 100")
t = 0
for _ in range(100):
    reward = run_episode(env, policy_grad, value_grad, sess)
    t += reward
    results.append(reward)
print("Got {}".format(t / 100))

# Submit
if submit and t / 100 > 195:
    # Submit to OpenAI Gym
    print("Submitting to gym...")
    gym.scoreboard.api_key = api_key
    env.close()
    gym.upload('cartpole')
else:
    # Plot
    #plt.plot(results)
    #plt.xlabel('Episode')
    #plt.ylabel('Rewards')
    #plt.title('Rewards over time')
    #plt.show()

    # Show ten rendered episodes
    print("Showing 10")
    for _ in range(10):
        reward = run_episode(env, policy_grad, value_grad, sess, True)