CartPole-v1 using A2C has very low reward for no apparent reason

I'm using A2C on CartPole-v1. The final goal is to report the mean reward over the last episodes. However, my reward stays below 10. I was wondering whether my code somehow erases part of the rewards during training? I don't know what's wrong with it; the setup of the code seems fine. The whole process is supposed to reach rewards of around 300~500.
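
To rule out a problem with the environment or the reward bookkeeping itself (as opposed to the training logic), here is a quick sanity check I can run separately from the code below. It is only a sketch: it uses a random policy with the same old gym API (reset()/step() returning (state, reward, done, info)) as my code. With random actions CartPole typically only survives around 20 steps per episode, so this just verifies that rewards are returned and summed, not that anything is learning.

import gym

env = gym.make('CartPole-v1')
for episode in range(5):
    env.reset()
    total_reward = 0.0
    done = False
    while not done:
        action = env.action_space.sample()    # random action (0 or 1)
        _, reward, done, _ = env.step(action)
        total_reward += reward                # CartPole gives +1 per surviving step
    print('episode', episode, 'return', total_reward)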



import tensorflow as tf
import numpy as np
import gym

# Suppresses compilation warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Constants
learning_rate = 0.001
# trials = 3
episodes = 1000
moves = 999
discount = 0.99
hidden_size = 32
critic_size = 128
updates = 50
avgRs = []
totRs = []


# Helper function to generate discounted returns
def generate_disRs(hist):
    dist = []
    last_reward = 0
    for element in reversed(hist):
        reward = discount * last_reward + element
        dist.append(reward)
        last_reward = reward
    return list(reversed(dist))


class A2C:
    def __init__(self):
        self.game = gym.make('CartPole-v1')
        self.game.reset()
        self.num_actions = self.game.action_space.n
        self.state_size = self.game.observation_space.shape[0]

        self.state_input = tf.placeholder(tf.float32, [None, self.state_size])
        self.rewards = tf.placeholder(shape=[None], dtype=tf.float32)

        # Define any additional placeholders needed for training your agent here:

        self.state_value = self.critic()
        self.actProbs = self.actor()
        self.loss_val = self.loss()
        self.train_op = self.optimizer()

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())

    def optimizer(self):
        """
        :return: Optimizer for your loss function
        """
        optimizer = tf.train.AdamOptimizer(learning_rate)
        trainOp = optimizer.minimize(self.loss)
        return trainOp

    def critic(self):
        """
        Calculates the estimated value for every state in
        self.state_input. The critic should not depend on
        any other tensors besides self.state_input.
        :return: A tensor of shape [num_states] representing the
        estimated value of each state in the trajectory.
        """
        # Weights for the critic network
        V1 = tf.Variable(tf.random_normal([4, critic_size], dtype=tf.float32, stddev=.1))
        v1Out = tf.nn.relu(tf.matmul(self.state_input, V1))
        V2 = tf.Variable(tf.random_normal([critic_size, 1], dtype=tf.float32, stddev=.1))
        self.v2Out = tf.matmul(v1Out, V2)
        return self.v2Out

    def actor(self):
        """
        Calculates the action probabilities for every state in self.state_input. The actor should not depend on
        any other tensors besides self.state_input.
        :return: A tensor of shape [num_states, num_actions] representing the probability distribution
        over actions that is generated by your actor.
        """
        # Layer 1
        # self.state = tf.placeholder(shape=[None, 4], dtype=tf.float32)
        self.W = tf.Variable(tf.random_uniform([4, hidden_size], dtype=tf.float32))
        self.bias = tf.Variable(tf.random_uniform([hidden_size], dtype=tf.float32))
        self.hidden = tf.nn.relu(tf.matmul(self.state_input, self.W) + self.bias)

        # Layer 2
        self.O = tf.Variable(tf.random_uniform([hidden_size, 2], dtype=tf.float32))
        self.bias2 = tf.Variable(tf.random_uniform([2], dtype=tf.float32))
        self.output = tf.nn.softmax(tf.matmul(self.hidden, self.O) + self.bias2)

        # Gather the probability of the action actually taken in each state
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.indices = tf.range(0, tf.shape(self.output)[0]) * 2 + self.actions
        self.actProbs = tf.gather(tf.reshape(self.output, [-1]), self.indices)
        # self.aloss = -tf.reduce_mean(tf.log(self.actProbs) * self.advantage)
        return self.actProbs
        # return self.aloss

    def loss(self):
        """
        :return: A scalar tensor representing the combined actor and critic loss.
        """
        # Placeholders
        # self.rewards = tf.placeholder(shape=[None], dtype=tf.float32)
        # self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        # self.indices = tf.range(0, tf.shape(self.output)[0]) * 2 + self.actions
        # self.actProbs = tf.gather(tf.reshape(self.output, [-1]), self.indices)
        self.aloss = -tf.reduce_mean(tf.log(self.actProbs) * self.rewards)
        self.cLoss = tf.reduce_mean(tf.square(self.rewards - self.v2Out))
        self.loss = self.aloss + self.cLoss
        return self.loss

    def train_episode(self):
        """
        train_episode will be called 1000 times by the autograder to train your agent. In this method,
        run your agent for a single episode, then use that data to train your agent. Feel free to
        add any return values to this method.
        """
        # Reset the environment
        st = self.game.reset()

        # Lists to store state, action and reward histories
        state_hist = []
        action_hist = []
        reward_hist = []

        # List to store the history of state values
        state_value_hist = []

        for move in range(moves):

            # Run the actor and critic on the current state
            actDict, stateVal = self.session.run([self.output, self.v2Out],
                                                 feed_dict={self.state_input: [st]})

            # Sample an action from the actor's distribution
            action = np.random.choice(np.array([0, 1]), p=actDict[0])

            st1, reward, done, info = self.game.step(action)

            # Render the game
            # game.render()

            # Add to the history
            action_hist.append(action)
            reward_hist.append(reward)
            state_hist.append(st)

            state_value_hist.append(stateVal[0][0])

            # Iterate
            st = st1

            # Update
            if done or (move % updates == 0 and move != 0):
                # Get discounted returns
                disRs = generate_disRs(reward_hist)

                # Compute the difference (advantage) between returns and state values
                difference = np.array(disRs) - np.array(state_value_hist)

                # Run a training step
                feed_dict = {self.state_input: state_hist,
                             self.actions: action_hist,
                             self.rewards: difference}
                l, _ = self.session.run([self.loss_val, self.train_op], feed_dict=feed_dict)

            if done:
                totRs.append(move)
                # print(move, disRs[0])
                break


def check_actor(model):
    """
    The autograder will use your actor() function to test your agent. This function
    checks that your actor returns a tensor of the right shape for the autograder.
    :return: True if the model's actor returns a tensor of the correct shape.
    """
    dummy_state = np.ones((10, 4))
    # actDict = model.session.run(model.output, feed_dict={model.state_input: [model.game.reset()]})
    actor_probs = model.session.run(model.actProbs,
                                    feed_dict={model.state_input: dummy_state})
    return actor_probs.shape == (10, 2)


if __name__ == '__main__':
    # Change __main__ to train your agent for 1000 episodes and print the average reward over the last 100 episodes.
    # The code below is similar to what our autograder will be running.

    learner = A2C()
    for i in range(1000):
        learner.train_episode()
    print(str(np.average(totRs[900:1000])))
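
For what it's worth, the discounted-return helper on its own seems to do what I expect; here is a standalone check with the same discount = 0.99 and a short made-up reward list:

discount = 0.99

def generate_disRs(hist):
    dist = []
    last_reward = 0
    for element in reversed(hist):
        reward = discount * last_reward + element
        dist.append(reward)
        last_reward = reward
    return list(reversed(dist))

# For rewards [1, 1, 1] the discounted returns should be
# [1 + 0.99*(1 + 0.99*1), 1 + 0.99*1, 1], i.e. approximately [2.9701, 1.99, 1.0]
print(generate_disRs([1, 1, 1]))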