Reinforcement Learning Tutorial 2 (Cart Pole problem)
import numpy as np
import pickle
import tensorflow as tf
import matplotlib.pyplot as plt
import math
import gym

env = gym.make('LunarLander-v2')
print('Shape of the observation space is', env.observation_space.shape)

# hyperparameters
H = 100  # number of hidden layer neurons
batch_size = 5  # every how many episodes to do a param update?
learning_rate = 1e-4  # feel free to play with this to train faster or more stably.
gamma = 0.99  # discount factor for reward

D, = env.observation_space.shape  # input dimensionality

tf.reset_default_graph()

# This defines the network as it goes from taking an observation of the environment
# to giving a probability of choosing each of the available actions.
observations = tf.placeholder(tf.float32, [None, D], name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
                     initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("W2", shape=[H, H],
                     initializer=tf.contrib.layers.xavier_initializer())
layer2 = tf.nn.relu(tf.matmul(layer1, W2))
W3 = tf.get_variable("W3", shape=[H, env.action_space.n],
                     initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer2, W3)
probability = tf.nn.softmax(score)

# From here we define the parts of the network needed for learning a good policy.
tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32, [None, env.action_space.n], name="input_y")
advantages = tf.placeholder(tf.float32, name="reward_signal")

# The loss function. This sends the weights in the direction of making actions
# that gave a good advantage (reward over time) more likely, and actions that didn't less likely.
# Earlier attempts at the loss, kept commented out:
# loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
# loglik = input_y*(input_y - probability) + (1 - input_y)*(input_y + probability)
# loglik = input_y*probability + (1 - input_y)*(1 - probability)
# loss = -tf.reduce_sum(loglik * advantages)
# The version actually used: an advantage-weighted squared error between the one-hot
# action taken and the predicted action probabilities.
loglik = tf.square(input_y - probability)
loss = tf.reduce_sum(loglik * advantages)
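# A more conventional REINFORCE-style loss (just a sketch, not what this gist trains with)
# would weight the log-probability of the action actually taken by its advantage:
# log_prob = tf.reduce_sum(input_y * tf.log(probability + 1e-10), axis=1)
# loss = -tf.reduce_sum(log_prob * tf.reshape(advantages, [-1]))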
newGrads = tf.gradients(loss, tvars)

# Once we have collected a series of gradients from multiple episodes, we apply them.
# We don't just apply gradients after every episode, in order to average out noise in the reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)  # Our optimizer
W1Grad = tf.placeholder(tf.float32, name="batch_grad1")  # Placeholders to send the final gradients through when we update.
W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
W3Grad = tf.placeholder(tf.float32, name="batch_grad3")
batchGrad = [W1Grad, W2Grad, W3Grad]
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))


def discount_rewards(r):
    """Take a 1D float array of rewards and compute the discounted reward."""
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
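
# For example, with gamma = 0.99 the rewards [1.0, 0.0, 1.0] become
# [1.9801, 0.99, 1.0]: each entry is its own reward plus the
# gamma-discounted sum of every reward that follows it.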
# %%time
xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []
running_reward = None
running_loss = None
reward_sum = 0
episode_number = 1
total_episodes = 1000
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()  # Obtain an initial observation of the environment

    # Reset the gradient placeholder. We will collect gradients in
    # gradBuffer until we are ready to update our policy network.
    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        print(grad.shape)
        gradBuffer[ix] = grad * 0

    while episode_number <= total_episodes:

        # Rendering the environment slows things down,
        # so let's only look at it once our agent is doing a good job.
        if reward_sum / batch_size > 0:
            env.render()

        # Make sure the observation is in a shape the network can handle.
        x = np.reshape(observation, [1, D])

        # Run the policy network and get an action to take.
        tfprob = sess.run(probability, feed_dict={observations: x})
        # print(tfprob)
        # action = 0 if np.random.uniform() < tfprob else 1
        action = np.argmax(tfprob)
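        # A stochastic alternative (not part of the original gist): sample the action from
        # the softmax output so the agent keeps exploring rather than always acting greedily:
        # action = np.random.choice(env.action_space.n, p=tfprob[0])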
        xs.append(x)  # observation
        y = action  # a "fake label"
        ys.append(y)

        # step the environment and get new measurements
        observation, reward, done, info = env.step(action)
        reward_sum += reward

        drs.append(reward)  # record reward (has to be done after we call step() to get the reward for the previous action)

        if done:
            # print(drs)
            # print(tfprob)
            episode_number += 1
            # stack together all observations, actions and rewards for this episode
            epx = np.vstack(xs)
            # epy = np.vstack(ys)
            epy = np.eye(env.action_space.n)[ys]  # one-hot encode the actions taken
            epr = np.vstack(drs)
            tfp = tfps
            xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []  # reset array memory

            # compute the discounted reward backwards through time
            discounted_epr = discount_rewards(epr)
            # standardize the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)

            # Get the gradient for this episode, and save it in the gradBuffer
            tProb, tLoglik, tLoss, tGrad = sess.run(
                fetches=(probability, loglik, loss, newGrads),
                feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            if episode_number % 500 == 0:
                for item in zip(discounted_epr, epy, tProb, tLoglik):
                    print(item)

            # Iterating over the layers
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad

            # If we have completed enough episodes, then update the policy network with our gradients.
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1], W3Grad: gradBuffer[2]})
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

                # Give a summary of how well our network is doing for each batch of episodes.
                running_reward = reward_sum if running_reward is None else running_reward * 0.95 + reward_sum * 0.05
                running_loss = tLoss if running_loss is None else running_loss * 0.95 + tLoss * 0.05
                print('%d Episode reward %f. Running reward %f. Episode loss %f. Running loss %f.'
                      % (episode_number, reward_sum / batch_size, running_reward / batch_size, tLoss, running_loss))
                if reward_sum / batch_size > 200:
                    print("Task solved in", episode_number, 'episodes!')
                    break
                reward_sum = 0

            observation = env.reset()

print(episode_number, 'Episodes completed.')
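Note: the script targets the TensorFlow 1.x graph API (tf.placeholder, tf.Session, tf.contrib). If you only have TensorFlow 2.x installed, a minimal workaround, assuming the tf.compat.v1 shim is available in your build, is to swap the import and drop the contrib initializer:

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()
    # tf.contrib no longer exists in 2.x; tf.glorot_uniform_initializer() is the
    # equivalent of tf.contrib.layers.xavier_initializer() used above.

LunarLander-v2 also needs gym's Box2D extras (typically installed with pip install gym[box2d]).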