@alexhiggins732
Created February 21, 2021 02:44
Keras Actor Critic in TensorFlow.net
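// C# port of the Keras "Actor Critic Method" CartPole example to TensorFlow.NET.
// It also includes a minimal managed CartPole-v0 environment and a weighted
// random-choice helper, since a gym environment binding and np.random.choice
// are not available here.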
using System;
using System.Collections.Generic;
using NumSharp;
using Tensorflow.Keras.Layers;
using Tensorflow.Keras.Losses;
using Tensorflow.Keras.Optimizers;
using Tensorflow.Keras.Utils;
using static Tensorflow.Binding;
using static Tensorflow.KerasApi;
using System.Linq;
using System.Threading;
namespace Tensorflow
{
// Inspired by the TensorFlow/Keras sample "Actor Critic Method in CartPole Environment".
internal class A2C
{
int seed = 42;
float gamma = .99f;
int max_steps_per_episode = 10000;
int num_inputs = 4;
int num_actions = 2;
int num_hidden = 128;
GymEnvironment env;
// Machine epsilon for float32, matching the Python sample's np.finfo(np.float32).eps.item()
// (1.1920928955078125e-07). Note that .NET's float.Epsilon (1.401298E-45) is the smallest
// subnormal float, not machine epsilon, so the value is hard-coded here. Used below to avoid
// division by zero when normalizing returns.
float eps = 1.1920928955078125e-07F;
public A2C()
{
env = GymEnvironment.make("CartPole-v0");
env.seed(seed);
}
//TODO: allow input of action space, input space.
Tensors inputs;
Tensors common;
Tensors action;
Tensors critic;
Tensors outputs;
Keras.Engine.Functional model;
private OptimizerV2 optimizer;
private ILossFunc huber_loss;
private List<Tensor> action_probs_history;
private List<Tensor> critic_value_history;
private List<float> rewards_history;
private float running_reward;
private int episode_count;
float[] state;
private float episode_reward;
internal void Run()
{
// Build the actor-critic model: a shared hidden layer feeds an actor head
// (softmax over the actions) and a critic head (a single state-value estimate).
var layers = new LayersApi();
inputs = keras.Input(num_inputs, dtype: TF_DataType.TF_FLOAT);
common = layers.Dense(num_hidden, activation: "relu").Apply(inputs);
action = layers.Dense(num_actions, activation: "softmax").Apply(common);
critic = keras.layers.Dense(1).Apply(common);
outputs = new Tensors(action, critic);
model = keras.Model(inputs, outputs, name: "a2c");
//optimizer = keras.optimizers.Adam(learning_rate = 0.01)
optimizer = keras.optimizers.Adam(learning_rate: 0.01f);
//huber_loss = keras.losses.Huber()
huber_loss = keras.losses.Huber();
//action_probs_history = []
action_probs_history = new List<Tensor>();
//critic_value_history = []
critic_value_history = new List<Tensor>();
//rewards_history = []
rewards_history = new List<float>();
//running_reward = 0
running_reward = 0f;
//episode_count = 0
episode_count = 0;
train();
}
void train()
{
while (true)// run until solved
{
var tensorstate = env.reset();
episode_reward = 0;
using (var tape = tf.GradientTape())
{
for (var timestep = 1; timestep < max_steps_per_episode; timestep++)
{
//env.render //TODO:
var stateAsTensor2 = tf.expand_dims(tensorstate, axis: 0);
//Predict action probabilities and estimated future rewards
//from environment state
//(action_probs, critic_value) = model(state)
var result = model.predict(stateAsTensor2);
var action_probs = result[0];
var critic_value = result[1];
var output = critic_value.ToArray<float>();
critic_value_history.Add(critic_value[0, 0]);
var probabilities = action_probs.ToArray<double>();
// Sample an action with a custom weighted choice because NumSharp's
// np.random.choice(num_actions, probabilities: probabilities) throws NotImplementedException.
var action = RandomChoice.Choice(num_actions, probabilities);
// Record the log probability of the sampled action for the actor loss.
var probLog = tf.math.log(action_probs[0, action]);
action_probs_history.append(probLog);
//state, reward, done, _ = env.step(action);
var stepResult = env.step(action);
tensorstate = stepResult.state;
state = stepResult.state.ToArray<float>();
var reward = stepResult.reward;
rewards_history.Add(reward);
episode_reward += reward;
if (stepResult.done)
break;
}
// Update the running reward (exponential moving average) used to check the solve condition.
running_reward = 0.05f * episode_reward + (1 - 0.05f) * running_reward;
// Calculate the expected value from rewards:
// - at each timestep, the total reward received after that timestep
// - rewards in the past are discounted by multiplying them with gamma
// - these are the labels for the critic
var returns = new List<float>();
var discounted_sum = 0f;
// Walk the rewards in reverse so each entry accumulates only future rewards,
// matching the Python reference: for r in rewards_history[::-1].
for (var i = rewards_history.Count - 1; i >= 0; i--)
{
discounted_sum = rewards_history[i] + gamma * discounted_sum;
returns.Insert(0, discounted_sum);
}
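// Normalize the returns to zero mean and unit variance; eps guards against division by zero.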
var npReturns = np.array(returns.ToArray());
npReturns = (npReturns - np.mean(npReturns)) / (np.std(npReturns) + eps);
var returnsASList = npReturns.ToArray<float>();
//history = zip(action_probs_history, critic_value_history, returns)
var actor_losses = new List<Tensor>();
var critic_losses = new List<Tensor>();
for (var i = 0; i < action_probs_history.Count; i++)
{
var log_prob = action_probs_history[i];
var value = critic_value_history[i];
var ret = returns[i];
// At this point in history, the critic estimated that we would get a
// total reward = `value` in the future. We took an action with log probability
// of `log_prob` and ended up receiving a total reward = `ret`.
// The actor must be updated so that it predicts an action that leads to
// high rewards (compared to critic's estimate) with high probability.
var diff = ret - value;
actor_losses.Add(diff);
// The critic must be updated so that it predicts a better estimate of
// the future rewards.
var retTensor = tf.convert_to_tensor(ret);
var loss = huber_loss.Call(tf.expand_dims(value, 0), tf.expand_dims(retTensor, 0));
critic_losses.Add(loss);
}
//var loss_value = sum(actor_losses) + sum(critic_losses); //broken
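// NOTE (caveat): reducing the losses to plain floats and re-wrapping them in a tensor
// detaches loss_value from the operations recorded on the tape, so the gradients below may
// come back empty. Summing the loss tensors directly (e.g. with tf.add_n or
// tf.reduce_sum(tf.stack(...)), if this TF.NET build supports them) would keep the loss
// connected to the model variables, as the Python sample does.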
var actor_losses_sum = actor_losses.SelectMany(x => x.ToArray<float>()).Sum();
var critic_losses_sum = critic_losses.SelectMany(x => x.ToArray<float>()).Sum();
// Backpropagation
float loss_value = actor_losses_sum + critic_losses_sum;
Tensor loss_value_tensor = tf.convert_to_tensor(loss_value);
var grads = tape.gradient(loss_value_tensor, model.trainable_variables);
//optimizer.apply_gradients(zip(grads, model.trainable_variables))
var zipped = grads.Zip(model.trainable_variables.Cast<ResourceVariable>()).ToList();
optimizer.apply_gradients(zipped);
// Clear the loss and reward history
action_probs_history.Clear();
critic_value_history.Clear();
rewards_history.Clear();
}
episode_count += 1;
if (episode_count % 10 == 0)
{
Console.WriteLine($"running reward: {running_reward.ToString("N2")} at {episode_count}");
}
if (running_reward > 195) //# Condition to consider the task solved
{
print($"Solved at episode {episode_count}!");
break;
}
}
//Log details
}
}
public static class RandomChoice
{
//from: https://stackoverflow.com/a/43345968/624988
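// Example: RandomChoice.Choice(2, new[] { 0.25, 0.75 }) returns 1 roughly 75% of the time.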
static readonly ThreadLocal<Random> _random = new ThreadLocal<Random>(() => new Random());
static IEnumerable<T> Choice<T>(IList<T> sequence, int size, double[] distribution)
{
double sum = 0;
// First change the shape of the distribution probability array:
// we need it to be cumulative, that is,
// if you have [0.1, 0.2, 0.3, 0.4]
// we need [0.1, 0.3, 0.6, 1.0] instead.
var cumulative = distribution.Select(c =>
{
var result = c + sum;
sum += c;
return result;
}).ToList();
for (int i = 0; i < size; i++)
{
// Generate a random double; NextDouble() always returns a value in [0, 1).
var r = _random.Value.NextDouble();
// Find the first index in the cumulative array that is greater than or equal to the generated value.
var idx = cumulative.BinarySearch(r);
// If an exact match is not found, List.BinarySearch returns the bitwise complement (a negative
// number) of the index of the next larger element; apply ~ to recover the real index.
if (idx < 0)
idx = ~idx;
if (idx > cumulative.Count - 1)
idx = cumulative.Count - 1; // rare case where the probabilities sum to slightly less than 1 because of double precision (e.g. 0.999943)
// return item at given index
yield return sequence[idx];
}
}
public static T Choice<T>(IList<T> sequence, double[] distribution)
{
return Choice(sequence, 1, distribution).First();
}
public static int Choice(int upTo, double[] distribution)
{
return Choice(Enumerable.Range(0, upTo).ToArray(), distribution);
}
}
class GymEnvironments
{
public const string CartPolev0 = "CartPole-v0";
}
public interface ILogger
{
void warn(string message);
}
public class ConsoleLogger : ILogger
{
public void warn(string message)
{
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine(message);
Console.ResetColor();
}
}
public abstract class GymEnvironment
{
protected ILogger logger = new ConsoleLogger();
internal static GymEnvironment make(string v)
{
switch (v)
{
case GymEnvironments.CartPolev0:
return new CartPolev0();
default:
throw new NotImplementedException();
}
}
public abstract void seed(int seed);
public abstract Tensor reset();
public virtual EnvironmentStepResult step(int action)
{
throw new NotImplementedException();
}
}
public class CartPolev0 : GymEnvironment
{
private float gravity;
private float masscart;
private float masspole;
private float total_mass;
private float length;
private float polemass_length;
private float force_mag;
private float tau;
private string kinematics_integrator;
private float theta_threshold_radians;
private float x_threshold;
private NDArray high;
private spaces action_space;
private spaces observation_space;
private NDArray state;
private int? steps_beyond_done;
NumPyRandom rng;
public CartPolev0()
{
this.gravity = 9.8f;
this.masscart = 1.0f;
this.masspole = 0.1f;
this.total_mass = (this.masspole + this.masscart);
this.length = 0.5f; // actually half the pole's length
this.polemass_length = (this.masspole * this.length);
this.force_mag = 10.0f;
this.tau = 0.02f; //seconds between state updates
this.kinematics_integrator = "euler";
//# Angle at which to fail the episode
this.theta_threshold_radians = (float)(12 * 2 * Math.PI / 360);
this.x_threshold = 2.4f;
// Angle limit set to 2 * theta_threshold_radians so failing observation
// is still within bounds.
var highValues = new float[] {
this.x_threshold * 2,
float.MaxValue, // np.finfo(np.float32).max
this.theta_threshold_radians * 2,
float.MaxValue // np.finfo(np.float32).max
};
this.high = np.array(highValues);
var negHigh = high.negative();
this.action_space = spaces.Discrete(2);
this.observation_space = spaces.Box(negHigh, high, np.float32);
//this.seed();
//this.viewer = None;
this.state = null;
this.steps_beyond_done = null;
}
int _seed;
public override void seed(int seed)
{
this._seed = seed;
rng = np.random.RandomState(seed);
}
public override Tensor reset()
{
// random_ops.random_uniform(new int[] { }, minval: -0.05f, maxval: 0.05f);
var result = rng.uniform(-0.05f, 0.05f, (4));
var asFloat = result.astype(NPTypeCode.Float);
this.state = asFloat;
steps_beyond_done = null;
return np.array(asFloat);
}
public override EnvironmentStepResult step(int action)
{
// err_msg = "%r (%s) invalid" % (action, type(action))
//assert self.action_space.contains(action), err_msg
var stateAsArray = this.state.ToArray<float>();
float x = stateAsArray[0];
float x_dot = stateAsArray[1];
float theta = stateAsArray[2];
float theta_dot = stateAsArray[3];
float force = action == 1 ? this.force_mag : -this.force_mag;
float costheta = (float)Math.Cos(theta);
float sintheta = (float)Math.Sin(theta);
// For the interested reader:
// https://coneural.org/florian/papers/05_cart_pole.pdf
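// temp is the pole's contribution to the cart's acceleration; thetaacc and xacc are the
// angular and linear accelerations, which are integrated over one tau step below.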
float temp = (force + this.polemass_length * (float)Math.Pow(theta_dot, 2) * sintheta) / this.total_mass;
float thetaacc = (this.gravity * sintheta - costheta * temp) / (this.length * (4.0f / 3.0f - this.masspole * (float)Math.Pow(costheta, 2) / this.total_mass));
float xacc = temp - this.polemass_length * thetaacc * costheta / this.total_mass;
if (this.kinematics_integrator == "euler")
{
x = x + this.tau * x_dot;
x_dot = x_dot + this.tau * xacc;
theta = theta + this.tau * theta_dot;
theta_dot = theta_dot + this.tau * thetaacc;
}
else
{ // semi-implicit Euler
x_dot = x_dot + this.tau * xacc;
x = x + this.tau * x_dot;
theta_dot = theta_dot + this.tau * thetaacc;
theta = theta + this.tau * theta_dot;
}
this.state = new[] { x, x_dot, theta, theta_dot };
var done = (
x < -this.x_threshold
|| x > this.x_threshold
|| theta < -this.theta_threshold_radians
|| theta > this.theta_threshold_radians
);
var reward = 0f;
if (!done)
{
reward = 1.0f;
}
else if (this.steps_beyond_done is null)
{
// Pole just fell!
this.steps_beyond_done = 0;
reward = 1.0f;
}
else
{
if (this.steps_beyond_done == 0)
logger.warn(
"You are calling 'step()' even though this " +
"environment has already returned done = True. You " +
"should always call 'reset()' once you receive 'done = " +
"True' -- any further steps are undefined behavior."
);
this.steps_beyond_done += 1;
}
//return (np.array(this.state), reward, done, new object[] { });
var result = new EnvironmentStepResult
{
state = np.array(this.state),
reward = reward,
done = done,
data = new object[] { }
};
return result;
}
}
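// Mirrors the (observation, reward, done, info) tuple returned by gym's env.step().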
public class EnvironmentStepResult
{
public Tensor state;
public float reward;
public bool done;
public object data;
}
public class spaces
{
public static Discrete Discrete(int value) => new Discrete(value);
internal static Box Box(NDArray x, NDArray y, Type dtype)
=> new Box(x, y, dtype);
}
public class Discrete : spaces
{
public Discrete(int size)
{
this.Size = size;
}
public int Size { get; }
}
public class Box : spaces
{
public Box(NDArray x, NDArray y, Type dtype)
{
X = x;
Y = y;
Dtype = dtype;
}
public NDArray X { get; }
public NDArray Y { get; }
public Type Dtype { get; }
}
}