Skip to content

Instantly share code, notes, and snippets.

@brtkwr
Forked from awjuliani/rl-tutorial-2.ipynb
Last active April 23, 2018 08:40
Show Gist options
  • Select an option

  • Save brtkwr/ecc74e0d835c976481f7beaf08993272 to your computer and use it in GitHub Desktop.

Select an option

Save brtkwr/ecc74e0d835c976481f7beaf08993272 to your computer and use it in GitHub Desktop.
Reinforcement Learning Tutorial 2 (policy gradient from the Cart Pole tutorial, adapted here to LunarLander-v2)
import numpy as np
import pickle
import tensorflow as tf
import matplotlib.pyplot as plt
import math
import gym
# Environment the agent is trained on.
env = gym.make('LunarLander-v2')
print('Shape of the observation space is', env.observation_space.shape)

# ---- Hyperparameters -------------------------------------------------------
H = 100               # units in each hidden layer
batch_size = 5        # episodes accumulated between parameter updates
learning_rate = 1e-4  # optimizer step size; tune for speed vs. stability
gamma = 0.99          # reward discount factor

# Input dimensionality. The single-element unpacking deliberately fails if the
# observation space is not a flat vector.
(D,) = env.observation_space.shape
tf.reset_default_graph()


def _xavier_weights(name, shape):
    """Create a trainable weight matrix with Xavier initialisation."""
    return tf.get_variable(
        name,
        shape=shape,
        initializer=tf.contrib.layers.xavier_initializer(),
    )


# Policy network: maps a batch of observations to a probability distribution
# over the environment's discrete actions. Two ReLU hidden layers of width H
# followed by a linear scoring layer and a softmax; no bias terms are used.
observations = tf.placeholder(tf.float32, [None, D], name="input_x")

W1 = _xavier_weights("W1", [D, H])
layer1 = tf.nn.relu(tf.matmul(observations, W1))

W2 = _xavier_weights("W2", [H, H])
layer2 = tf.nn.relu(tf.matmul(layer1, W2))

W3 = _xavier_weights("W3", [H, env.action_space.n])
score = tf.matmul(layer2, W3)        # unnormalised action scores (logits)
probability = tf.nn.softmax(score)   # per-action probabilities
# From here we define the parts of the network needed for learning a good policy.
tvars = tf.trainable_variables()

# One-hot encoding of the action actually taken at each step.
input_y = tf.placeholder(tf.float32, [None, env.action_space.n], name="input_y")
# Normalised discounted return for each step. The training loop feeds this as
# a column vector so that it broadcasts across the action axis below.
advantages = tf.placeholder(tf.float32, name="reward_signal")

# Policy-gradient (REINFORCE) loss.
# `loglik` holds log pi(a_t | s_t) in the column of the action that was taken
# and zero elsewhere. Minimising -sum(loglik * advantage) makes actions that
# were followed by above-average return more likely and the others less
# likely. log_softmax is applied to the raw scores rather than taking
# tf.log(probability), which would produce -inf once a probability underflows.
loglik = input_y * tf.nn.log_softmax(score)
loss = -tf.reduce_sum(loglik * advantages)
newGrads = tf.gradients(loss, tvars)

# Once we have collected a series of gradients from multiple episodes, we apply them.
# We don't apply gradients after every episode, in order to average out noise
# in the reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)  # Our optimizer

# Placeholders through which the accumulated gradients are fed at update time.
W1Grad = tf.placeholder(tf.float32, name="batch_grad1")
W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
W3Grad = tf.placeholder(tf.float32, name="batch_grad3")
batchGrad = [W1Grad, W2Grad, W3Grad]
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))
def discount_rewards(r, discount=None):
    """Compute the discounted return for every step of an episode.

    Args:
        r: Per-step rewards as a 1D array, or a column vector of shape (N, 1).
        discount: Discount factor applied per step. When omitted, the
            module-level ``gamma`` is used.

    Returns:
        A floating-point array with the same shape as ``r`` whose entry ``t``
        is ``r[t] + discount * r[t+1] + discount**2 * r[t+2] + ...``.
    """
    if discount is None:
        discount = gamma
    r = np.asarray(r)
    # Integer rewards must not dictate the output dtype, otherwise every
    # discounted value is silently truncated when stored.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros_like(r, dtype=out_dtype)
    running_add = 0.0
    # Walk backwards so each step reuses the already-discounted future return.
    for t in reversed(range(r.size)):
        running_add = running_add * discount + r[t]
        discounted_r[t] = running_add
    return discounted_r
# %%time
# Per-episode buffers: observations, rewards, and the actions that were taken.
xs, drs, ys = [], [], []
running_reward = None
running_loss = None
reward_sum = 0          # reward accumulated over the current batch of episodes
episode_number = 0      # number of episodes completed so far
total_episodes = 1000
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    observation = env.reset()  # Obtain an initial observation of the environment

    # Gradient accumulator: one zeroed array per trainable variable. Gradients
    # are summed here until enough episodes have been collected for an update.
    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        print(grad.shape)
        gradBuffer[ix] = grad * 0

    while episode_number < total_episodes:
        # Rendering the environment slows things down,
        # so only look at it once the agent is collecting positive reward.
        if reward_sum / batch_size > 0:
            env.render()

        # Make sure the observation is in a shape the network can handle.
        x = np.reshape(observation, [1, D])

        # Run the policy network and SAMPLE an action from its output.
        # Always taking the argmax would make the policy deterministic, so the
        # agent would never explore and the gradient estimate would be biased.
        tfprob = sess.run(probability, feed_dict={observations: x})
        action_probs = tfprob[0].astype(np.float64)
        # Renormalise: float32 softmax output may not sum to 1 tightly enough
        # for np.random.choice's validation.
        action_probs /= action_probs.sum()
        action = int(np.random.choice(action_probs.size, p=action_probs))

        xs.append(x)       # observation
        ys.append(action)  # the action taken serves as a "fake label"

        # Step the environment and get new measurements.
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        # Recorded after step() so the reward lines up with the action above.
        drs.append(reward)

        if done:
            episode_number += 1

            # Stack together all inputs, actions, and rewards for this episode.
            epx = np.vstack(xs)
            epy = np.eye(env.action_space.n)[ys]  # one-hot encode the actions
            epr = np.vstack(drs)                  # column vector of rewards
            xs, drs, ys = [], [], []              # reset episode memory

            # Compute the discounted reward backwards through time.
            discounted_epr = discount_rewards(epr)
            # Standardise the returns (helps control the gradient estimator
            # variance). Skip the division when every return is identical,
            # which would otherwise divide by zero and feed NaNs to the graph.
            discounted_epr -= np.mean(discounted_epr)
            epr_std = np.std(discounted_epr)
            if epr_std > 0:
                discounted_epr /= epr_std

            # Get the gradient for this episode, and save it in the gradBuffer.
            tProb, tLoglik, tLoss, tGrad = sess.run(
                fetches=(probability, loglik, loss, newGrads),
                feed_dict={
                    observations: epx,
                    input_y: epy,
                    advantages: discounted_epr,
                },
            )

            # Periodic per-step dump for debugging.
            if episode_number % 500 == 0:
                for item in zip(discounted_epr, epy, tProb, tLoglik):
                    print(item)

            # Accumulate this episode's gradient for every layer.
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad

            # If we have completed enough episodes, then update the policy
            # network with our gradients.
            if episode_number % batch_size == 0:
                sess.run(
                    updateGrads,
                    feed_dict={
                        W1Grad: gradBuffer[0],
                        W2Grad: gradBuffer[1],
                        W3Grad: gradBuffer[2],
                    },
                )
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

                # Give a summary of how well our network is doing for each
                # batch of episodes. tLoss is the loss of the batch's last
                # episode only.
                running_reward = reward_sum if running_reward is None else running_reward * 0.95 + reward_sum * 0.05
                running_loss = tLoss if running_loss is None else running_loss * 0.95 + tLoss * 0.05
                print('%d Episode reward %f. Running reward %f. Episode loss %f. Running loss %f.' % (episode_number, reward_sum / batch_size, running_reward / batch_size, tLoss, running_loss))

                if reward_sum / batch_size > 200:
                    print("Task solved in", episode_number, 'episodes!')
                    break

                reward_sum = 0

            observation = env.reset()

print(episode_number, 'Episodes completed.')
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment