laceholder(tf.float32,name="batch_grad1") # Placeholders to send the final gradients through when we update.
W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
batchGrad = [W1Grad,W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad,tvars))
def discount_rewards(r):
""" take 1D float array of rewards and compute discounted reward """
discounted_r = np.zeros_like(r)
running_add = 0
for t in reversed(range(r.size)):
running_add = running_add * gamma + r[t]
discounted_r[t] = running_add
return discounted_r
xs,ys,drs = [],[],[]
#running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
rendering = False
sess.run(init)
observation = env.reset() # Obtain an initial observation of the environment
# Reset the gradient placeholder. We will collect gradients in
# gradBuffer until we are ready to update our policy network.
gradBuffer = sess.run(tvars)
for ix,grad in enumerate(gradBuffer):
gradBuffer[ix] = grad * 0
while episode_number <= total_episodes:
# Rendering the environment slows things down,
# so let's only look at it once our agent is doing a good job.
if reward_sum/batch_size > 100 or rendering == True :
env.render()
rendering = True
# Make sure the observation is in a shape the network can handle.
x = np.reshape(observation,[1,D])
# Run the policy network and get an action to take.
tfprob = sess.run(probability,feed_dict={observations: x})
action = 1 if np.random.uniform() < tfprob else 0
xs.append(x) # observation
y = 1 if action == 0 else 0 # a "fake label"
ys.append(y)
# step the environment and get new measurements
observation, reward, done, info = env.step(action)
reward_sum += reward
drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)
if done:
episode_number += 1
# stack together all inputs, hidden states, action gradients, and rewards for this episode
epx = np.vstack(xs)
epy = np.vstack(ys)
epr = np.vstack(drs)
xs,ys,drs = [],[],[] # reset array memory
# compute the discounted reward backwards through time
discounted_epr = discount_rewards(epr)
# size the rewards to be unit normal (helps control the gradient estimator variance)
discounted_epr -= np.mean(discounted_epr)
discounted_epr /= np.std(discounted_epr)
# Get the gradient for this episode, and save it in the gradBuffer
tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
for ix,grad in enumerate(tGrad):
gradBuffer[ix] += grad
# If we have completed enough episodes, then update the policy network with our gradients.
if episode_number % batch_size == 0:
sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
for ix,grad in enumerate(gradBuffer):
gradBuffer[ix] = grad * 0
# Give a summary of how well our network is doing for each batch of episodes.
#running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
print('Average reward for episode %d : %f.' % (episode_number,reward_sum/batch_size))
if reward_sum/batch_size > 200:
print("Task solved in",episode_number,'episodes!')
break
reward_sum = 0
observation = env.reset()
参考资料: 《TensorFlow实战》
欢迎付费咨询(150元每小时),我的微信:qingxingfengz |