CartPole-v1 using A2C has very low reward for no apparent reason
I'm using A2C on CartPole-v1. The final goal is to report the mean reward, but my reward stays below 10. I was wondering whether my code is throwing away part of the rewards somewhere during training; I can't tell what's wrong with it, and the setup of the code looks fine to me. The whole run is supposed to reach a reward of around 300~500.
import tensorflow as tf
import numpy as np
import gym

# Suppresses compilation warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Constants
learning_rate = 0.001
#trials = 3
episodes = 1000
moves = 999
discount = 0.99
hidden_size = 32
critic_size = 128
updates = 50
avgRs = []
totRs = []

# Helper function to compute discounted returns
def generate_disRs(hist):
    dist = []
    last_reward = 0
    for element in reversed(hist):
        reward = discount * last_reward + element
        dist.append(reward)
        last_reward = reward
    return list(reversed(dist))
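# (Illustrative note, not part of the original submission.) With discount = 0.99,
# generate_disRs walks the reward history backwards and returns the discounted
# return for each step, e.g.
#   [round(r, 4) for r in generate_disRs([1.0, 1.0, 1.0])]  ->  [2.9701, 1.99, 1.0]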
class A2C:
    def __init__(self):
        self.game = gym.make('CartPole-v1')
        self.game.reset()
        self.num_actions = self.game.action_space.n
        self.state_size = self.game.observation_space.shape[0]
        self.state_input = tf.placeholder(tf.float32, [None, self.state_size])
        self.rewards = tf.placeholder(shape=[None], dtype=tf.float32)
        # Define any additional placeholders needed for training your agent here:
        self.state_value = self.critic()
        self.actProbs = self.actor()
        self.loss_val = self.loss()
        self.train_op = self.optimizer()
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())

    def optimizer(self):
        """
        :return: Optimizer for your loss function
        """
        optimizer = tf.train.AdamOptimizer(learning_rate)
        trainOp = optimizer.minimize(self.loss)
        return trainOp

    def critic(self):
        """
        Calculates the estimated value for every state in
        self.state_input. The critic should not depend on
        any other tensors besides self.state_input.
        :return: A tensor of shape [num_states] representing the
        estimated value of each state in the trajectory.
        """
        # Weights for the critic network
        V1 = tf.Variable(tf.random_normal([4, critic_size], dtype=tf.float32, stddev=.1))
        v1Out = tf.nn.relu(tf.matmul(self.state_input, V1))
        V2 = tf.Variable(tf.random_normal([critic_size, 1], dtype=tf.float32, stddev=.1))
        self.v2Out = tf.matmul(v1Out, V2)
        return self.v2Out

    def actor(self):
        """
        Calculates the action probabilities for every state in self.state_input. The actor should not depend on
        any other tensors besides self.state_input.
        :return: A tensor of shape [num_states, num_actions] representing the probability distribution
        over actions that is generated by your actor.
        """
        # Layer 1
        # self.state = tf.placeholder(shape=[None, 4], dtype=tf.float32)
        self.W = tf.Variable(tf.random_uniform([4, hidden_size], dtype=tf.float32))
        self.bias = tf.Variable(tf.random_uniform([hidden_size], dtype=tf.float32))
        self.hidden = tf.nn.relu(tf.matmul(self.state_input, self.W) + self.bias)
        # Layer 2
        self.O = tf.Variable(tf.random_uniform([hidden_size, 2], dtype=tf.float32))
        self.bias2 = tf.Variable(tf.random_uniform([2], dtype=tf.float32))
        self.output = tf.nn.softmax(tf.matmul(self.hidden, self.O) + self.bias2)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.indices = tf.range(0, tf.shape(self.output)[0]) * 2 + self.actions
        self.actProbs = tf.gather(tf.reshape(self.output, [-1]), self.indices)
        #self.aloss = -tf.reduce_mean(tf.log(self.actProbs) * self.advantage)
        return self.actProbs
        #return self.aloss

    def loss(self):
        """
        :return: A scalar tensor representing the combined actor and critic loss.
        """
        # Placeholders
        # self.rewards = tf.placeholder(shape=[None], dtype=tf.float32)
        # self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        # self.indices = tf.range(0, tf.shape(self.output)[0]) * 2 + self.actions
        # self.actProbs = tf.gather(tf.reshape(self.output, [-1]), self.indices)
        self.aloss = -tf.reduce_mean(tf.log(self.actProbs) * self.rewards)
        self.cLoss = tf.reduce_mean(tf.square(self.rewards - self.v2Out))
        self.loss = self.aloss + self.cLoss
        return self.loss

    def train_episode(self):
        """
        train_episode will be called 1000 times by the autograder to train your agent. In this method,
        run your agent for a single episode, then use that data to train your agent. Feel free to
        add any return values to this method.
        """
        # reset
        st = self.game.reset()
        # Lists to store state, action and reward histories
        state_hist = []
        action_hist = []
        reward_hist = []
        # List to store history of state values
        state_value_hist = []
        for move in range(moves):
            # Run
            actDict, stateVal = self.session.run([self.output, self.v2Out],
                                                 feed_dict={self.state_input: [st]})
            # Get the random action
            action = np.random.choice(np.array([0, 1]), p=actDict[0])
            st1, reward, done, info = self.game.step(action)
            # Render the game
            # game.render()
            # Add to the history
            action_hist.append(action)
            reward_hist.append(reward)
            state_hist.append(st)
            state_value_hist.append(stateVal[0][0])
            # Iterate
            st = st1
            # Update
            if done or (move % updates == 0 and move != 0):
                # Get disRs
                disRs = generate_disRs(reward_hist)
                # Compute difference
                difference = np.array(disRs) - np.array(state_value_hist)
                # Run
                feed_dict = {self.state_input: state_hist,
                             self.actions: action_hist,
                             self.rewards: difference}
                l, _ = self.session.run([self.loss_val, self.train_op], feed_dict=feed_dict)
            if done:
                totRs.append(move)
                # print move, disRs[0]
                break

def check_actor(model):
    """
    The autograder will use your actor() function to test your agent. This function
    checks that your actor returns a tensor of the right shape for the autograder.
    :return: True if the model's actor returns a tensor of the correct shape.
    """
    dummy_state = np.ones((10, 4))
    #actDict = model.session.run(model.output, feed_dict={model.state_input: [model.game.reset()]})
    actor_probs = model.session.run(model.actProbs, feed_dict={
        model.state_input: dummy_state
    })
    return actor_probs.shape == (10, 2)

if __name__ == '__main__':
    # Change __main__ to train your agent for 1000 episodes and print the average reward over the last 100 episodes.
    # The code below is similar to what our autograder will be running.
    learner = A2C()
    for i in range(1000):
        learner.train_episode()
    print(str(np.average(totRs[900: 1000])))
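For reference, the monitoring I use while debugging is just a variant of the training loop above (it assumes the same learner, totRs and np as in the code; the 100-episode window and print interval are arbitrary choices, not part of the assignment):

for i in range(1000):
    learner.train_episode()
    # Every 100 episodes, print the mean episode length over the most recent
    # finished episodes, to see whether it ever moves off ~10.
    if (i + 1) % 100 == 0 and totRs:
        print('episode %d: mean of last 100 finished episodes = %.1f'
              % (i + 1, np.mean(totRs[-100:])))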
python-3.x deep-learning reinforcement-learning
asked Nov 14 '18 at 2:19
Wei Bovey