from __future__ import division, print_function
import random
from collections import deque

import gym
import numpy as np
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression
NUM_ACTIONS = 2          # CartPole-v0: push the cart left or right
NUM_STATES = 4           # observation: cart position, cart velocity, pole angle, pole angular velocity
MAX_REPLAY_STATES = 100  # size of the experience-replay memory
NUM_GAMES_TRAIN = 200    # number of training episodes
def create_model(n_inputs, n_outputs):
    network = input_data(shape = [None, n_inputs])
    network = fully_connected(network, 128, activation = 'relu')
    #network = dropout(network, 0.5)
    network = fully_connected(network, 256, activation = 'relu')
    #network = dropout(network, 0.5)
    network = fully_connected(network, 128, activation = 'relu')
    #network = dropout(network, 0.5)
    network = fully_connected(network, n_outputs, activation = 'softmax')
    network = regression(network,
                         optimizer = 'adam',
                         loss = 'categorical_crossentropy')
    model = tflearn.DNN(
        network,
        max_checkpoints = 0,
        tensorboard_verbose = 0,
        tensorboard_dir = 'logs'
    )
    return model
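
# Illustrative usage sketch, assuming the network defined above: the model maps a
# 4-dimensional CartPole state to a score per action, e.g.
#   m = create_model(NUM_STATES, NUM_ACTIONS)
#   print(m.predict([[0.0, 0.0, 0.0, 0.0]]))  # -> [[score_left, score_right]]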
env = gym.make('CartPole-v0')
model = create_model(NUM_STATES, NUM_ACTIONS)
replay = deque([])  # experience-replay memory of (new_state, reward, action, done, old_state)
gamma = 0.9         # discount factor
epsilon = 1         # exploration rate, decayed towards 0.1
for number_game in range(NUM_GAMES_TRAIN):
    observation = env.reset()
    reward_game = 0
    print('[+] Game ' + str(number_game))
    while True:
        env.render()
        q = model.predict([observation])[0]
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = np.random.randint(0, NUM_ACTIONS)
        else:
            action = np.argmax(q)
        new_state, reward, done, info = env.step(action)
        reward_game += reward
        replay.append((new_state, reward, action, done, observation))
        if len(replay) > MAX_REPLAY_STATES:
            replay.popleft()  # drop the oldest transition
        observation = new_state  # advance the current state for the next action selection
        # Build Q-learning targets from every transition in the replay memory
        X_train = []
        Y_train = []
        for rep in replay:
            rep_new_state, rep_reward, rep_action, rep_done, old_state = rep
            old_q = model.predict([old_state])[0]
            new_q = model.predict([rep_new_state])[0]
            max_new_q = np.max(new_q)
            update_target = np.zeros(NUM_ACTIONS)
            update_target[:] = old_q[:]
            if rep_done:
                update = rep_reward
            else:
                update = rep_reward + (gamma * max_new_q)
            update_target[rep_action] = update
            X_train.append(old_state)
            Y_train.append(update_target)
        model.fit(
            X_train, Y_train,
            validation_set = 0,
            n_epoch = 1,
            batch_size = MAX_REPLAY_STATES,
            shuffle = True,
            show_metric = False,
            snapshot_step = 200,
            snapshot_epoch = False,
            run_id = 'cartpole_rl'
        )
        if done or reward_game > 200:
            break
    print('[+] Game ' + str(number_game) + ' Reward ' + str(reward_game))
    if epsilon > 0.1:
        epsilon -= (1 / 1000)
env.monitor.close()
model.save('model.tfl')
#gym.upload(
# '/tmp/cartpole-experiment-1',
# writeup = 'https://gist.github.com/gdb/',
# api_key = ''
#)
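
# Illustrative follow-up sketch: run one greedy evaluation episode with the trained
# network (no exploration). It reuses the `env` and `model` objects from above and the
# old gym reset/step API; a usage example rather than part of the training procedure.
obs = env.reset()
eval_reward = 0
while True:
    env.render()
    eval_action = int(np.argmax(model.predict([obs])[0]))  # always pick the highest-scored action
    obs, reward, done, info = env.step(eval_action)
    eval_reward += reward
    if done:
        break
print('[+] Evaluation reward ' + str(eval_reward))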