zh4ngx · July 1, 2017 03:20
diff --git a/hill_climb_3.py b/hill_climb_3.py
 import gym
 import numpy as np
 from gym.wrappers.monitoring import Monitor

 # Search settings used by the hill-climbing loop further down.
 MC_POLICY_EVAL_EP = 10    # episodes averaged for each candidate weight vector
 BASE_NOISE_FACTOR = 0.5   # noise scale before annealing is applied
 NUM_POLICY_EVAL = 500     # number of candidate weight vectors evaluated


 # Create CartPole-v0 and wrap it in gym's Monitor, which is handed the
 # 'tmp/cart-pole-hill-climb-3' directory (with force=True).
 env = Monitor(
    gym.make('CartPole-v0'),
    'tmp/cart-pole-hill-climb-3',
    force=True,
 )

 # Report the action and observation spaces once at start-up.
 print("Action space: {0}".format(env.action_space))
 print(
    "Observation space: {0}\n\tLow: {1}\n\tHigh: {2}".format(
        env.observation_space,
        env.observation_space.low,
        env.observation_space.high,
    )
 )


 def action_selection(weights, observation):
    """Choose a discrete action with a linear policy.

    Returns 0 when the product of ``weights`` and ``observation`` is
    negative, and 1 otherwise.
    """
    score = np.matmul(weights, observation)
    return 0 if score < 0 else 1


 def run_episode(weights, max_steps=200, render=True):
    """Play one episode in the module-level ``env`` and return its total reward.

    At every step the action is chosen by
    ``action_selection(weights, observation)``.

    Args:
        weights: Policy weights passed through to ``action_selection``.
        max_steps: Upper bound on the number of steps taken. Defaults to
            200, the limit that used to be hard-coded here.
        render: Whether to call ``env.render()`` before each step. Defaults
            to True, matching the previous behaviour.

    Returns:
        The sum of the rewards returned by ``env.step`` during the episode.
    """
    observation = env.reset()
    total_reward = 0
    for t in range(max_steps):
        if render:
            env.render()
        action = action_selection(weights, observation)
        observation, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            print("Episode finished after {0} timesteps with reward {1}".format(
                t + 1,
                total_reward,
            ))
            break

    return total_reward


 def evaluate_policy(num_episodes, weights):
    """Estimate the mean episode reward obtained with ``weights``.

    Calls ``run_episode(weights)`` ``num_episodes`` times, keeps an
    incrementally updated average of the returned rewards, prints the
    final estimate and returns it.
    """
    running_mean = 0
    for episode_index in range(num_episodes):
        episode_return = run_episode(weights)
        # Incremental mean: move the estimate toward the new sample by 1/k,
        # where k is the number of episodes seen so far.
        running_mean += (episode_return - running_mean) / (episode_index + 1)

    print("Mean reward estimated as {0} for past {1} episodes".format(
        running_mean,
        num_episodes
    ))
    return running_mean


 # Hill-climbing state: the best mean reward seen so far and the weights
 # that achieved it. Initial weights are drawn uniformly from [-1, 1).
 best_reward = -np.inf
 best_params = np.random.rand(4) * 2 - 1

 print("Running Hill Climb on Cart Pole")
 print("Params:\n\tMC Eval Count: {0} trajectories\n\tBase Noise Factor: {1}".format(
    MC_POLICY_EVAL_EP,
    BASE_NOISE_FACTOR,
 ))

 try:
    for i_episode in range(NUM_POLICY_EVAL):
        # Linearly shrink the noise scale from BASE_NOISE_FACTOR towards 0.
        # float() keeps this a true division even where int / int truncates
        # (Python 2), which would otherwise leave the noise un-annealed.
        annealing_term = 1 - (i_episode / float(NUM_POLICY_EVAL))
        noise_scaling = BASE_NOISE_FACTOR * annealing_term
        print("Applying jitter with factor {0} to parameters {1}".format(
            noise_scaling,
            best_params,
        ))

        # Perturb the best weights with Gaussian noise (mean 0, standard
        # deviation noise_scaling) to obtain a candidate.
        noise_term = np.random.randn(4) * noise_scaling
        parameters = best_params + noise_term
        episodic_reward = evaluate_policy(MC_POLICY_EVAL_EP, parameters)

        # Keep the candidate only if it strictly beats the best so far.
        if episodic_reward > best_reward:
            print("Episode {2}: Got new best reward of {0}, better than previous of {1}".format(
                episodic_reward,
                best_reward,
                i_episode,
            ))
            best_reward = episodic_reward
            best_params = parameters
 finally:
    # Close the environment even if the search is interrupted or fails.
    env.close()
	import gym
	import numpy as np
	from gym.wrappers.monitoring import Monitor

	# Search settings used by the hill-climbing loop further down.
	MC_POLICY_EVAL_EP = 10    # episodes averaged for each candidate weight vector
	BASE_NOISE_FACTOR = 0.5   # noise scale before annealing is applied
	NUM_POLICY_EVAL = 500     # number of candidate weight vectors evaluated


	# Create CartPole-v0 and wrap it in gym's Monitor, which is handed the
	# 'tmp/cart-pole-hill-climb-3' directory (with force=True).
	env = Monitor(
	    gym.make('CartPole-v0'),
	    'tmp/cart-pole-hill-climb-3',
	    force=True,
	)

	# Report the action and observation spaces once at start-up.
	print("Action space: {0}".format(env.action_space))
	print(
	    "Observation space: {0}\n\tLow: {1}\n\tHigh: {2}".format(
	        env.observation_space,
	        env.observation_space.low,
	        env.observation_space.high,
	    )
	)


	def action_selection(weights, observation):
	    """Choose a discrete action with a linear policy.

	    Returns 0 when the product of ``weights`` and ``observation`` is
	    negative, and 1 otherwise.
	    """
	    score = np.matmul(weights, observation)
	    return 0 if score < 0 else 1


	def run_episode(weights, max_steps=200, render=True):
	    """Play one episode in the module-level ``env`` and return its total reward.

	    At every step the action is chosen by
	    ``action_selection(weights, observation)``.

	    Args:
	        weights: Policy weights passed through to ``action_selection``.
	        max_steps: Upper bound on the number of steps taken. Defaults to
	            200, the limit that used to be hard-coded here.
	        render: Whether to call ``env.render()`` before each step. Defaults
	            to True, matching the previous behaviour.

	    Returns:
	        The sum of the rewards returned by ``env.step`` during the episode.
	    """
	    observation = env.reset()
	    total_reward = 0
	    for t in range(max_steps):
	        if render:
	            env.render()
	        action = action_selection(weights, observation)
	        observation, reward, done, _ = env.step(action)
	        total_reward += reward
	        if done:
	            print("Episode finished after {0} timesteps with reward {1}".format(
	                t + 1,
	                total_reward,
	            ))
	            break

	    return total_reward


	def evaluate_policy(num_episodes, weights):
	    """Estimate the mean episode reward obtained with ``weights``.

	    Calls ``run_episode(weights)`` ``num_episodes`` times, keeps an
	    incrementally updated average of the returned rewards, prints the
	    final estimate and returns it.
	    """
	    mean_reward = 0
	    for k in range(1, num_episodes + 1):
	        reward = run_episode(weights)
	        # Incremental mean: move the estimate toward the new sample by 1/k.
	        error = reward - mean_reward
	        mean_reward += error / k

	    print("Mean reward estimated as {0} for past {1} episodes".format(
	        mean_reward,
	        num_episodes
	    ))
	    return mean_reward


	# Hill-climbing state: the best mean reward seen so far and the weights
	# that achieved it. Initial weights are drawn uniformly from [-1, 1).
	best_reward = -np.inf
	best_params = np.random.rand(4) * 2 - 1

	print("Running Hill Climb on Cart Pole")
	print("Params:\n\tMC Eval Count: {0} trajectories\n\tBase Noise Factor: {1}".format(
	    MC_POLICY_EVAL_EP,
	    BASE_NOISE_FACTOR,
	))

	try:
	    for i_episode in range(NUM_POLICY_EVAL):
	        # Linearly shrink the noise scale from BASE_NOISE_FACTOR towards 0.
	        # float() keeps this a true division even where int / int truncates
	        # (Python 2), which would otherwise leave the noise un-annealed.
	        annealing_term = 1 - (i_episode / float(NUM_POLICY_EVAL))
	        noise_scaling = BASE_NOISE_FACTOR * annealing_term
	        print("Applying jitter with factor {0} to parameters {1}".format(
	            noise_scaling,
	            best_params,
	        ))

	        # Perturb the best weights with Gaussian noise (mean 0, standard
	        # deviation noise_scaling) to obtain a candidate.
	        noise_term = np.random.randn(4) * noise_scaling
	        parameters = best_params + noise_term
	        episodic_reward = evaluate_policy(MC_POLICY_EVAL_EP, parameters)

	        # Keep the candidate only if it strictly beats the best so far.
	        if episodic_reward > best_reward:
	            print("Episode {2}: Got new best reward of {0}, better than previous of {1}".format(
	                episodic_reward,
	                best_reward,
	                i_episode,
	            ))
	            best_reward = episodic_reward
	            best_params = parameters
	finally:
	    # Close the environment even if the search is interrupted or fails.
	    env.close()