'''
Problem description:
https://gym.openai.com/envs/FrozenLake-v0
The agent should move from the starting point (S) to the goal position (G) safely.
The environment is a 4x4 grid of blocks:
F means frozen (safe), H means hole (unsafe).
Example:
    SFFF
    FHFH
    FFFH
    HFFG
The reward at every step is 0, except when entering G (1).
Moves:
    LEFT  = 0
    DOWN  = 1
    RIGHT = 2
    UP    = 3
Note: sometimes the wind blows and pushes the agent to an unexpected place;
the probability of slipping is 2/3.
'''
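
# Concretely (as implemented in gym's FrozenLake with is_slippery=True): each
# step, the intended move and the two perpendicular moves each happen with
# probability 1/3, which matches the 2/3 slip probability described above.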
import gym
from gym import wrappers
import numpy as np

CREATE_REPORT = 0  # set to 1 to record a log and upload it for submission

def modify_reward(reward, done):
    # reward shaping: punish termination without reward (a hole),
    # reward reaching the goal, give a small bonus for each safe step
    if done and reward == 0:
        return -100.0
    elif done:
        return 50.0
    else:
        return 1.0
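
# Why shape the reward? FrozenLake's native reward is sparse (0 everywhere,
# 1 only when entering G), which gives tabular Q-learning little signal.
# modify_reward() densifies it:
#   modify_reward(0.0, True)  -> -100.0  (episode ended without reward:
#                                         fell into a hole or hit the time limit)
#   modify_reward(1.0, True)  ->   50.0  (reached the goal G)
#   modify_reward(0.0, False) ->    1.0  (survived another step on the ice)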

# make the environment
env = gym.make('FrozenLake-v0')

# save the log (for submission)
logfile = '../openai_gym/frozenlake'
if CREATE_REPORT:
    env = wrappers.Monitor(env, logfile, force=True)

# for reproducibility
env.seed(0)
np.random.seed(0)
print(env.observation_space.n, env.action_space.n)  # 16 states, 4 actions

# initialize the Q-table with all zeros
Q = np.zeros([env.observation_space.n, env.action_space.n])

# set parameters
lr = .05  # learning rate (an alternative value to try: .85)
y = .99   # discount factor (gamma)
num_episodes = 5000

# list to save the total reward per episode
rList = []

# play episodes
first = 0  # first episode that reached the goal
for i in range(num_episodes):
    # reset the environment and get a new observation (the first state)
    s = env.reset()
    rAll = 0  # total (raw) reward of this episode
    done = False
    j = 0  # current timestep

    # the Q-table learning algorithm
    while not done:
        #env.render()
        j += 1  # increase the timestep
        # choose an action by greedily picking from the Q-table;
        # add noise to encourage exploration, getting smaller as episodes increase
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
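        # (This noisy-argmax rule is a simple stand-in for epsilon-greedy
        # exploration: the noise scale 1/(i+1) anneals toward pure
        # exploitation as training progresses.)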
        # get the new state and reward from the environment by doing the action
        s_new, r, done, _ = env.step(a)
        r_mod = modify_reward(r, done)
        # update the Q-table with the new knowledge
        Q[s, a] = Q[s, a] + lr * (r_mod + y * np.max(Q[s_new, :]) - Q[s, a])
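        # The line above is the standard tabular Q-learning update,
        #   Q(s,a) <- Q(s,a) + lr * (r + y * max_a' Q(s',a') - Q(s,a)),
        # applied with the shaped reward r_mod in place of the raw reward r.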
        # logging: accumulate the raw (unshaped) reward
        rAll += r
        # quit if the agent succeeded or fell into a hole
        if done:
            if first == 0 and r != 0.0:
                first = i
                print(i)
            break
        # set the new state for the next iteration
        s = s_new
    # outside the while loop
    rList.append(rAll)
print "Score over time: " + str(sum(rList)/num_episodes) | |
print "Final Q-Table Values" | |
print np.argmax(Q,1) | |
env.close() | |

def moving_average(x, n=100):
    csum = x.cumsum()
    return (csum[n:] - csum[:-n]) / n
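
# Worked example (two-element windows for brevity):
#   moving_average(np.array([0., 1., 1., 0.]), n=2)
#   -> cumsum = [0, 1, 2, 2]; (cumsum[2:] - cumsum[:-2]) / 2 = [1.0, 0.5]
# Note that this formulation drops the very first length-n window.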

ma = moving_average(np.asarray(rList))
print("Best 100-episode average reward was %f." % ma.max())
solved = len(np.where(ma >= .78)[0]) > 0  # solving criterion set by OpenAI
if solved:
    print("Solved after %d episodes." % np.where(ma >= .78)[0][0])
else:
    print("Unsolved!")

# upload the result
if CREATE_REPORT:
    key = ''  # your OpenAI Gym API key
    gym.upload(logfile, api_key=key)