add yapf style and apply yapf to format all Python files
This recreates pull request #2192
This commit is contained in:
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Executable scripts for reinforcement learning."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Example configurations using the PPO algorithm."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -29,6 +28,7 @@ import pybullet_envs.bullet.minitaur_gym_env as minitaur_gym_env
|
||||
import pybullet_envs
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def default():
|
||||
"""Default configuration for PPO."""
|
||||
# General
|
||||
@@ -38,10 +38,7 @@ def default():
|
||||
use_gpu = False
|
||||
# Network
|
||||
network = networks.feed_forward_gaussian
|
||||
weight_summaries = dict(
|
||||
all=r'.*',
|
||||
policy=r'.*/policy/.*',
|
||||
value=r'.*/value/.*')
|
||||
weight_summaries = dict(all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
|
||||
policy_layers = 200, 100
|
||||
value_layers = 200, 100
|
||||
init_mean_factor = 0.1
|
||||
@@ -52,7 +49,7 @@ def default():
|
||||
optimizer = tf.train.AdamOptimizer
|
||||
update_epochs_policy = 64
|
||||
update_epochs_value = 64
|
||||
learning_rate = 1e-4
|
||||
learning_rate = 1e-4
|
||||
# Losses
|
||||
discount = 0.995
|
||||
kl_target = 1e-2
|
||||
@@ -69,6 +66,7 @@ def pybullet_pendulum():
|
||||
steps = 5e7 # 50M
|
||||
return locals()
|
||||
|
||||
|
||||
def pybullet_doublependulum():
|
||||
locals().update(default())
|
||||
env = 'InvertedDoublePendulumBulletEnv-v0'
|
||||
@@ -76,6 +74,7 @@ def pybullet_doublependulum():
|
||||
steps = 5e7 # 50M
|
||||
return locals()
|
||||
|
||||
|
||||
def pybullet_pendulumswingup():
|
||||
locals().update(default())
|
||||
env = 'InvertedPendulumSwingupBulletEnv-v0'
|
||||
@@ -83,6 +82,7 @@ def pybullet_pendulumswingup():
|
||||
steps = 5e7 # 50M
|
||||
return locals()
|
||||
|
||||
|
||||
def pybullet_cheetah():
|
||||
"""Configuration for MuJoCo's half cheetah task."""
|
||||
locals().update(default())
|
||||
@@ -92,6 +92,7 @@ def pybullet_cheetah():
|
||||
steps = 1e8 # 100M
|
||||
return locals()
|
||||
|
||||
|
||||
def pybullet_ant():
|
||||
locals().update(default())
|
||||
env = 'AntBulletEnv-v0'
|
||||
@@ -99,6 +100,7 @@ def pybullet_ant():
|
||||
steps = 5e7 # 50M
|
||||
return locals()
|
||||
|
||||
|
||||
def pybullet_kuka_grasping():
|
||||
"""Configuration for Bullet Kuka grasping task."""
|
||||
locals().update(default())
|
||||
@@ -113,7 +115,7 @@ def pybullet_racecar():
|
||||
"""Configuration for Bullet MIT Racecar task."""
|
||||
locals().update(default())
|
||||
# Environment
|
||||
env = 'RacecarBulletEnv-v0' #functools.partial(racecarGymEnv.RacecarGymEnv, isDiscrete=False, renders=True)
|
||||
env = 'RacecarBulletEnv-v0' #functools.partial(racecarGymEnv.RacecarGymEnv, isDiscrete=False, renders=True)
|
||||
max_length = 10
|
||||
steps = 1e7 # 10M
|
||||
return locals()
|
||||
@@ -132,29 +134,27 @@ def pybullet_minitaur():
|
||||
"""Configuration specific to minitaur_gym_env.MinitaurBulletEnv class."""
|
||||
locals().update(default())
|
||||
randomizer = (minitaur_env_randomizer.MinitaurEnvRandomizer())
|
||||
env = functools.partial(
|
||||
minitaur_gym_env.MinitaurBulletEnv,
|
||||
accurate_motor_model_enabled=True,
|
||||
motor_overheat_protection=True,
|
||||
pd_control_enabled=True,
|
||||
env_randomizer=randomizer,
|
||||
render=False)
|
||||
env = functools.partial(minitaur_gym_env.MinitaurBulletEnv,
|
||||
accurate_motor_model_enabled=True,
|
||||
motor_overheat_protection=True,
|
||||
pd_control_enabled=True,
|
||||
env_randomizer=randomizer,
|
||||
render=False)
|
||||
max_length = 1000
|
||||
steps = 3e7 # 30M
|
||||
return locals()
|
||||
|
||||
|
||||
def pybullet_duck_minitaur():
  """Configuration specific to minitaur_gym_env.MinitaurBulletDuckEnv class.

  Starts from the mapping returned by default() and overrides the environment
  constructor, the episode length limit and the number of training steps.

  Returns:
    A dict holding every entry of default() plus the keys `randomizer`, `env`,
    `max_length` and `steps` set here.
  """
  # Build the result explicitly instead of `locals().update(...)` followed by
  # `return locals()`: writes to a function's locals() mapping are not
  # guaranteed to persist (they are dropped on Python 3.13+, see PEP 667),
  # which would silently lose every default value.
  config = dict(default())
  randomizer = minitaur_env_randomizer.MinitaurEnvRandomizer()
  config.update(
      # Kept in the result because the original `return locals()` exposed it.
      randomizer=randomizer,
      # A partial, so the environment itself is not constructed here.
      env=functools.partial(minitaur_gym_env.MinitaurBulletDuckEnv,
                            accurate_motor_model_enabled=True,
                            motor_overheat_protection=True,
                            pd_control_enabled=True,
                            env_randomizer=randomizer,
                            render=False),
      max_length=1000,
      steps=3e7,  # 30M
  )
  return config
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Network definitions for the PPO algorithm."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -24,13 +23,10 @@ import operator
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
NetworkOutput = collections.namedtuple(
|
||||
'NetworkOutput', 'policy, mean, logstd, value, state')
|
||||
NetworkOutput = collections.namedtuple('NetworkOutput', 'policy, mean, logstd, value, state')
|
||||
|
||||
|
||||
def feed_forward_gaussian(
|
||||
config, action_size, observations, unused_length, state=None):
|
||||
def feed_forward_gaussian(config, action_size, observations, unused_length, state=None):
|
||||
"""Independent feed forward networks for policy and value.
|
||||
|
||||
The policy network outputs the mean action and the log standard deviation
|
||||
@@ -50,20 +46,22 @@ def feed_forward_gaussian(
|
||||
factor=config.init_mean_factor)
|
||||
logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10)
|
||||
flat_observations = tf.reshape(observations, [
|
||||
tf.shape(observations)[0], tf.shape(observations)[1],
|
||||
functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)])
|
||||
tf.shape(observations)[0],
|
||||
tf.shape(observations)[1],
|
||||
functools.reduce(operator.mul,
|
||||
observations.shape.as_list()[2:], 1)
|
||||
])
|
||||
with tf.variable_scope('policy'):
|
||||
x = flat_observations
|
||||
for size in config.policy_layers:
|
||||
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
|
||||
mean = tf.contrib.layers.fully_connected(
|
||||
x, action_size, tf.tanh,
|
||||
weights_initializer=mean_weights_initializer)
|
||||
logstd = tf.get_variable(
|
||||
'logstd', mean.shape[2:], tf.float32, logstd_initializer)
|
||||
logstd = tf.tile(
|
||||
logstd[None, None],
|
||||
[tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2))
|
||||
mean = tf.contrib.layers.fully_connected(x,
|
||||
action_size,
|
||||
tf.tanh,
|
||||
weights_initializer=mean_weights_initializer)
|
||||
logstd = tf.get_variable('logstd', mean.shape[2:], tf.float32, logstd_initializer)
|
||||
logstd = tf.tile(logstd[None, None],
|
||||
[tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2))
|
||||
with tf.variable_scope('value'):
|
||||
x = flat_observations
|
||||
for size in config.value_layers:
|
||||
@@ -72,13 +70,11 @@ def feed_forward_gaussian(
|
||||
mean = tf.check_numerics(mean, 'mean')
|
||||
logstd = tf.check_numerics(logstd, 'logstd')
|
||||
value = tf.check_numerics(value, 'value')
|
||||
policy = tf.contrib.distributions.MultivariateNormalDiag(
|
||||
mean, tf.exp(logstd))
|
||||
policy = tf.contrib.distributions.MultivariateNormalDiag(mean, tf.exp(logstd))
|
||||
return NetworkOutput(policy, mean, logstd, value, state)
|
||||
|
||||
|
||||
def recurrent_gaussian(
|
||||
config, action_size, observations, length, state=None):
|
||||
def recurrent_gaussian(config, action_size, observations, length, state=None):
|
||||
"""Independent recurrent policy and feed forward value networks.
|
||||
|
||||
The policy network outputs the mean action and the log standard deviation
|
||||
@@ -100,21 +96,23 @@ def recurrent_gaussian(
|
||||
logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10)
|
||||
cell = tf.contrib.rnn.GRUBlockCell(config.policy_layers[-1])
|
||||
flat_observations = tf.reshape(observations, [
|
||||
tf.shape(observations)[0], tf.shape(observations)[1],
|
||||
functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)])
|
||||
tf.shape(observations)[0],
|
||||
tf.shape(observations)[1],
|
||||
functools.reduce(operator.mul,
|
||||
observations.shape.as_list()[2:], 1)
|
||||
])
|
||||
with tf.variable_scope('policy'):
|
||||
x = flat_observations
|
||||
for size in config.policy_layers[:-1]:
|
||||
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
|
||||
x, state = tf.nn.dynamic_rnn(cell, x, length, state, tf.float32)
|
||||
mean = tf.contrib.layers.fully_connected(
|
||||
x, action_size, tf.tanh,
|
||||
weights_initializer=mean_weights_initializer)
|
||||
logstd = tf.get_variable(
|
||||
'logstd', mean.shape[2:], tf.float32, logstd_initializer)
|
||||
logstd = tf.tile(
|
||||
logstd[None, None],
|
||||
[tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2))
|
||||
mean = tf.contrib.layers.fully_connected(x,
|
||||
action_size,
|
||||
tf.tanh,
|
||||
weights_initializer=mean_weights_initializer)
|
||||
logstd = tf.get_variable('logstd', mean.shape[2:], tf.float32, logstd_initializer)
|
||||
logstd = tf.tile(logstd[None, None],
|
||||
[tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2))
|
||||
with tf.variable_scope('value'):
|
||||
x = flat_observations
|
||||
for size in config.value_layers:
|
||||
@@ -123,7 +121,6 @@ def recurrent_gaussian(
|
||||
mean = tf.check_numerics(mean, 'mean')
|
||||
logstd = tf.check_numerics(logstd, 'logstd')
|
||||
value = tf.check_numerics(value, 'value')
|
||||
policy = tf.contrib.distributions.MultivariateNormalDiag(
|
||||
mean, tf.exp(logstd))
|
||||
policy = tf.contrib.distributions.MultivariateNormalDiag(mean, tf.exp(logstd))
|
||||
# assert state.shape.as_list()[0] is not None
|
||||
return NetworkOutput(policy, mean, logstd, value, state)
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Proximal Policy Optimization algorithm."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Proximal Policy Optimization algorithm.
|
||||
|
||||
Based on John Schulman's implementation in Python and Theano:
|
||||
@@ -49,51 +48,51 @@ class PPOAlgorithm(object):
|
||||
self._is_training = is_training
|
||||
self._should_log = should_log
|
||||
self._config = config
|
||||
self._observ_filter = normalize.StreamingNormalize(
|
||||
self._batch_env.observ[0], center=True, scale=True, clip=5,
|
||||
name='normalize_observ')
|
||||
self._reward_filter = normalize.StreamingNormalize(
|
||||
self._batch_env.reward[0], center=False, scale=True, clip=10,
|
||||
name='normalize_reward')
|
||||
self._observ_filter = normalize.StreamingNormalize(self._batch_env.observ[0],
|
||||
center=True,
|
||||
scale=True,
|
||||
clip=5,
|
||||
name='normalize_observ')
|
||||
self._reward_filter = normalize.StreamingNormalize(self._batch_env.reward[0],
|
||||
center=False,
|
||||
scale=True,
|
||||
clip=10,
|
||||
name='normalize_reward')
|
||||
# Memory stores tuple of observ, action, mean, logstd, reward.
|
||||
template = (
|
||||
self._batch_env.observ[0], self._batch_env.action[0],
|
||||
self._batch_env.action[0], self._batch_env.action[0],
|
||||
self._batch_env.reward[0])
|
||||
self._memory = memory.EpisodeMemory(
|
||||
template, config.update_every, config.max_length, 'memory')
|
||||
template = (self._batch_env.observ[0], self._batch_env.action[0], self._batch_env.action[0],
|
||||
self._batch_env.action[0], self._batch_env.reward[0])
|
||||
self._memory = memory.EpisodeMemory(template, config.update_every, config.max_length, 'memory')
|
||||
self._memory_index = tf.Variable(0, False)
|
||||
use_gpu = self._config.use_gpu and utility.available_gpus()
|
||||
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
|
||||
# Create network variables for later calls to reuse.
|
||||
action_size = self._batch_env.action.shape[1].value
|
||||
self._network = tf.make_template(
|
||||
'network', functools.partial(config.network, config, action_size))
|
||||
self._network = tf.make_template('network',
|
||||
functools.partial(config.network, config, action_size))
|
||||
output = self._network(
|
||||
tf.zeros_like(self._batch_env.observ)[:, None],
|
||||
tf.ones(len(self._batch_env)))
|
||||
tf.zeros_like(self._batch_env.observ)[:, None], tf.ones(len(self._batch_env)))
|
||||
with tf.variable_scope('ppo_temporary'):
|
||||
self._episodes = memory.EpisodeMemory(
|
||||
template, len(batch_env), config.max_length, 'episodes')
|
||||
self._episodes = memory.EpisodeMemory(template, len(batch_env), config.max_length,
|
||||
'episodes')
|
||||
if output.state is None:
|
||||
self._last_state = None
|
||||
else:
|
||||
# Ensure the batch dimension is set.
|
||||
tf.contrib.framework.nest.map_structure(
|
||||
lambda x: x.set_shape([len(batch_env)] + x.shape.as_list()[1:]),
|
||||
output.state)
|
||||
lambda x: x.set_shape([len(batch_env)] + x.shape.as_list()[1:]), output.state)
|
||||
# pylint: disable=undefined-variable
|
||||
self._last_state = tf.contrib.framework.nest.map_structure(
|
||||
lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
|
||||
output.state)
|
||||
self._last_action = tf.Variable(
|
||||
tf.zeros_like(self._batch_env.action), False, name='last_action')
|
||||
self._last_mean = tf.Variable(
|
||||
tf.zeros_like(self._batch_env.action), False, name='last_mean')
|
||||
self._last_logstd = tf.Variable(
|
||||
tf.zeros_like(self._batch_env.action), False, name='last_logstd')
|
||||
self._penalty = tf.Variable(
|
||||
self._config.kl_init_penalty, False, dtype=tf.float32)
|
||||
lambda x: tf.Variable(lambda: tf.zeros_like(x), False), output.state)
|
||||
self._last_action = tf.Variable(tf.zeros_like(self._batch_env.action),
|
||||
False,
|
||||
name='last_action')
|
||||
self._last_mean = tf.Variable(tf.zeros_like(self._batch_env.action),
|
||||
False,
|
||||
name='last_mean')
|
||||
self._last_logstd = tf.Variable(tf.zeros_like(self._batch_env.action),
|
||||
False,
|
||||
name='last_logstd')
|
||||
self._penalty = tf.Variable(self._config.kl_init_penalty, False, dtype=tf.float32)
|
||||
self._optimizer = self._config.optimizer(self._config.learning_rate)
|
||||
|
||||
def begin_episode(self, agent_indices):
|
||||
@@ -109,8 +108,7 @@ class PPOAlgorithm(object):
|
||||
if self._last_state is None:
|
||||
reset_state = tf.no_op()
|
||||
else:
|
||||
reset_state = utility.reinit_nested_vars(
|
||||
self._last_state, agent_indices)
|
||||
reset_state = utility.reinit_nested_vars(self._last_state, agent_indices)
|
||||
reset_buffer = self._episodes.clear(agent_indices)
|
||||
with tf.control_dependencies([reset_state, reset_buffer]):
|
||||
return tf.constant('')
|
||||
@@ -130,36 +128,33 @@ class PPOAlgorithm(object):
|
||||
if self._last_state is None:
|
||||
state = None
|
||||
else:
|
||||
state = tf.contrib.framework.nest.map_structure(
|
||||
lambda x: tf.gather(x, agent_indices), self._last_state)
|
||||
state = tf.contrib.framework.nest.map_structure(lambda x: tf.gather(x, agent_indices),
|
||||
self._last_state)
|
||||
output = self._network(observ[:, None], tf.ones(observ.shape[0]), state)
|
||||
action = tf.cond(
|
||||
self._is_training, output.policy.sample, lambda: output.mean)
|
||||
action = tf.cond(self._is_training, output.policy.sample, lambda: output.mean)
|
||||
logprob = output.policy.log_prob(action)[:, 0]
|
||||
# pylint: disable=g-long-lambda
|
||||
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
|
||||
tf.summary.histogram('mean', output.mean[:, 0]),
|
||||
tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
|
||||
tf.summary.histogram('action', action[:, 0]),
|
||||
tf.summary.histogram('logprob', logprob)]), str)
|
||||
summary = tf.cond(
|
||||
self._should_log, lambda: tf.summary.merge([
|
||||
tf.summary.histogram('mean', output.mean[:, 0]),
|
||||
tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
|
||||
tf.summary.histogram('action', action[:, 0]),
|
||||
tf.summary.histogram('logprob', logprob)
|
||||
]), str)
|
||||
# Remember current policy to append to memory in the experience callback.
|
||||
if self._last_state is None:
|
||||
assign_state = tf.no_op()
|
||||
else:
|
||||
assign_state = utility.assign_nested_vars(
|
||||
self._last_state, output.state, agent_indices)
|
||||
assign_state = utility.assign_nested_vars(self._last_state, output.state, agent_indices)
|
||||
with tf.control_dependencies([
|
||||
assign_state,
|
||||
tf.scatter_update(
|
||||
self._last_action, agent_indices, action[:, 0]),
|
||||
tf.scatter_update(
|
||||
self._last_mean, agent_indices, output.mean[:, 0]),
|
||||
tf.scatter_update(
|
||||
self._last_logstd, agent_indices, output.logstd[:, 0])]):
|
||||
tf.scatter_update(self._last_action, agent_indices, action[:, 0]),
|
||||
tf.scatter_update(self._last_mean, agent_indices, output.mean[:, 0]),
|
||||
tf.scatter_update(self._last_logstd, agent_indices, output.logstd[:, 0])
|
||||
]):
|
||||
return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
|
||||
|
||||
def experience(
|
||||
self, agent_indices, observ, action, reward, unused_done, unused_nextob):
|
||||
def experience(self, agent_indices, observ, action, reward, unused_done, unused_nextob):
|
||||
"""Process the transition tuple of the current step.
|
||||
|
||||
When training, add the current transition tuple to the memory and update
|
||||
@@ -181,34 +176,36 @@ class PPOAlgorithm(object):
|
||||
return tf.cond(
|
||||
self._is_training,
|
||||
# pylint: disable=g-long-lambda
|
||||
lambda: self._define_experience(
|
||||
agent_indices, observ, action, reward), str)
|
||||
lambda: self._define_experience(agent_indices, observ, action, reward),
|
||||
str)
|
||||
|
||||
def _define_experience(self, agent_indices, observ, action, reward):
  """Implement the branch of experience() entered during training.

  Updates the observation and reward filters, appends the transition tuple
  (observ, action, last mean, last logstd, reward) to the episode buffer for
  the given agents, and builds the logging summary.

  Args:
    agent_indices: Indices used to gather the stored mean/logstd and to
      address the episode buffer (presumably a 1-D integer tensor; confirm
      against the caller).
    observ: Observations passed to the observation filter and stored.
    action: Actions to store; replaced by self._last_action when
      config.train_on_agent_action is set.
    reward: Rewards passed to the reward filter and stored.

  Returns:
    The merged summary when self._should_log is true, otherwise the result of
    str(), i.e. an empty string.
  """
  # Each statement is built exactly once so that no duplicate ops or
  # uniquified summary tags end up in the graph.
  update_filters = tf.summary.merge(
      [self._observ_filter.update(observ),
       self._reward_filter.update(reward)])
  with tf.control_dependencies([update_filters]):
    if self._config.train_on_agent_action:
      # NOTE: Doesn't seem to change much.
      # NOTE(review): unlike _last_mean/_last_logstd below, _last_action is
      # not gathered by agent_indices; confirm this is intended.
      action = self._last_action
    batch = (observ, action, tf.gather(self._last_mean, agent_indices),
             tf.gather(self._last_logstd, agent_indices), reward)
    append = self._episodes.append(batch, agent_indices)
  # Everything below is ordered after the buffer write.
  with tf.control_dependencies([append]):
    norm_observ = self._observ_filter.transform(observ)
    norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))

    def _log_summaries():
      """Merge all summaries emitted when logging is enabled."""
      return tf.summary.merge([
          update_filters,
          self._observ_filter.summary(),
          self._reward_filter.summary(),
          tf.summary.scalar('memory_size', self._memory_index),
          tf.summary.histogram('normalized_observ', norm_observ),
          tf.summary.histogram('action', self._last_action),
          tf.summary.scalar('normalized_reward', norm_reward)
      ])

    # `str` is the false branch: it yields '' when logging is disabled.
    summary = tf.cond(self._should_log, _log_summaries, str)
    return summary
|
||||
|
||||
def end_episode(self, agent_indices):
|
||||
@@ -226,20 +223,16 @@ class PPOAlgorithm(object):
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope('end_episode/'):
|
||||
return tf.cond(
|
||||
self._is_training,
|
||||
lambda: self._define_end_episode(agent_indices), str)
|
||||
return tf.cond(self._is_training, lambda: self._define_end_episode(agent_indices), str)
|
||||
|
||||
def _define_end_episode(self, agent_indices):
|
||||
"""Implement the branch of end_episode() entered during training."""
|
||||
episodes, length = self._episodes.data(agent_indices)
|
||||
space_left = self._config.update_every - self._memory_index
|
||||
use_episodes = tf.range(tf.minimum(
|
||||
tf.shape(agent_indices)[0], space_left))
|
||||
use_episodes = tf.range(tf.minimum(tf.shape(agent_indices)[0], space_left))
|
||||
episodes = [tf.gather(elem, use_episodes) for elem in episodes]
|
||||
append = self._memory.replace(
|
||||
episodes, tf.gather(length, use_episodes),
|
||||
use_episodes + self._memory_index)
|
||||
append = self._memory.replace(episodes, tf.gather(length, use_episodes),
|
||||
use_episodes + self._memory_index)
|
||||
with tf.control_dependencies([append]):
|
||||
inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
|
||||
with tf.control_dependencies([inc_index]):
|
||||
@@ -256,8 +249,7 @@ class PPOAlgorithm(object):
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope('training'):
|
||||
assert_full = tf.assert_equal(
|
||||
self._memory_index, self._config.update_every)
|
||||
assert_full = tf.assert_equal(self._memory_index, self._config.update_every)
|
||||
with tf.control_dependencies([assert_full]):
|
||||
data = self._memory.data()
|
||||
(observ, action, old_mean, old_logstd, reward), length = data
|
||||
@@ -265,22 +257,18 @@ class PPOAlgorithm(object):
|
||||
length = tf.identity(length)
|
||||
observ = self._observ_filter.transform(observ)
|
||||
reward = self._reward_filter.transform(reward)
|
||||
update_summary = self._perform_update_steps(
|
||||
observ, action, old_mean, old_logstd, reward, length)
|
||||
update_summary = self._perform_update_steps(observ, action, old_mean, old_logstd, reward,
|
||||
length)
|
||||
with tf.control_dependencies([update_summary]):
|
||||
penalty_summary = self._adjust_penalty(
|
||||
observ, old_mean, old_logstd, length)
|
||||
penalty_summary = self._adjust_penalty(observ, old_mean, old_logstd, length)
|
||||
with tf.control_dependencies([penalty_summary]):
|
||||
clear_memory = tf.group(
|
||||
self._memory.clear(), self._memory_index.assign(0))
|
||||
clear_memory = tf.group(self._memory.clear(), self._memory_index.assign(0))
|
||||
with tf.control_dependencies([clear_memory]):
|
||||
weight_summary = utility.variable_summaries(
|
||||
tf.trainable_variables(), self._config.weight_summaries)
|
||||
return tf.summary.merge([
|
||||
update_summary, penalty_summary, weight_summary])
|
||||
weight_summary = utility.variable_summaries(tf.trainable_variables(),
|
||||
self._config.weight_summaries)
|
||||
return tf.summary.merge([update_summary, penalty_summary, weight_summary])
|
||||
|
||||
def _perform_update_steps(
|
||||
self, observ, action, old_mean, old_logstd, reward, length):
|
||||
def _perform_update_steps(self, observ, action, old_mean, old_logstd, reward, length):
|
||||
"""Perform multiple update steps of value function and policy.
|
||||
|
||||
The advantage is computed once at the beginning and shared across
|
||||
@@ -298,37 +286,29 @@ class PPOAlgorithm(object):
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
return_ = utility.discounted_return(
|
||||
reward, length, self._config.discount)
|
||||
return_ = utility.discounted_return(reward, length, self._config.discount)
|
||||
value = self._network(observ, length).value
|
||||
if self._config.gae_lambda:
|
||||
advantage = utility.lambda_return(
|
||||
reward, value, length, self._config.discount,
|
||||
self._config.gae_lambda)
|
||||
advantage = utility.lambda_return(reward, value, length, self._config.discount,
|
||||
self._config.gae_lambda)
|
||||
else:
|
||||
advantage = return_ - value
|
||||
mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
|
||||
advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
|
||||
advantage = tf.Print(
|
||||
advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
|
||||
'return and value: ')
|
||||
advantage = tf.Print(
|
||||
advantage, [tf.reduce_mean(advantage)],
|
||||
'normalized advantage: ')
|
||||
advantage = tf.Print(advantage,
|
||||
[tf.reduce_mean(return_), tf.reduce_mean(value)], 'return and value: ')
|
||||
advantage = tf.Print(advantage, [tf.reduce_mean(advantage)], 'normalized advantage: ')
|
||||
# pylint: disable=g-long-lambda
|
||||
value_loss, policy_loss, summary = tf.scan(
|
||||
lambda _1, _2: self._update_step(
|
||||
observ, action, old_mean, old_logstd, reward, advantage, length),
|
||||
tf.range(self._config.update_epochs),
|
||||
[0., 0., ''], parallel_iterations=1)
|
||||
print_losses = tf.group(
|
||||
tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
|
||||
tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
|
||||
value_loss, policy_loss, summary = tf.scan(lambda _1, _2: self._update_step(
|
||||
observ, action, old_mean, old_logstd, reward, advantage, length),
|
||||
tf.range(self._config.update_epochs), [0., 0., ''],
|
||||
parallel_iterations=1)
|
||||
print_losses = tf.group(tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
|
||||
tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
|
||||
with tf.control_dependencies([value_loss, policy_loss, print_losses]):
|
||||
return summary[self._config.update_epochs // 2]
|
||||
|
||||
def _update_step(
|
||||
self, observ, action, old_mean, old_logstd, reward, advantage, length):
|
||||
def _update_step(self, observ, action, old_mean, old_logstd, reward, advantage, length):
|
||||
"""Compute the current combined loss and perform a gradient update step.
|
||||
|
||||
Args:
|
||||
@@ -345,27 +325,20 @@ class PPOAlgorithm(object):
|
||||
"""
|
||||
value_loss, value_summary = self._value_loss(observ, reward, length)
|
||||
network = self._network(observ, length)
|
||||
policy_loss, policy_summary = self._policy_loss(
|
||||
network.mean, network.logstd, old_mean, old_logstd, action,
|
||||
advantage, length)
|
||||
value_gradients, value_variables = (
|
||||
zip(*self._optimizer.compute_gradients(value_loss)))
|
||||
policy_gradients, policy_variables = (
|
||||
zip(*self._optimizer.compute_gradients(policy_loss)))
|
||||
policy_loss, policy_summary = self._policy_loss(network.mean, network.logstd, old_mean,
|
||||
old_logstd, action, advantage, length)
|
||||
value_gradients, value_variables = (zip(*self._optimizer.compute_gradients(value_loss)))
|
||||
policy_gradients, policy_variables = (zip(*self._optimizer.compute_gradients(policy_loss)))
|
||||
all_gradients = value_gradients + policy_gradients
|
||||
all_variables = value_variables + policy_variables
|
||||
optimize = self._optimizer.apply_gradients(
|
||||
zip(all_gradients, all_variables))
|
||||
optimize = self._optimizer.apply_gradients(zip(all_gradients, all_variables))
|
||||
summary = tf.summary.merge([
|
||||
value_summary, policy_summary,
|
||||
tf.summary.scalar(
|
||||
'value_gradient_norm', tf.global_norm(value_gradients)),
|
||||
tf.summary.scalar(
|
||||
'policy_gradient_norm', tf.global_norm(policy_gradients)),
|
||||
utility.gradient_summaries(
|
||||
zip(value_gradients, value_variables), dict(value=r'.*')),
|
||||
utility.gradient_summaries(
|
||||
zip(policy_gradients, policy_variables), dict(policy=r'.*'))])
|
||||
tf.summary.scalar('value_gradient_norm', tf.global_norm(value_gradients)),
|
||||
tf.summary.scalar('policy_gradient_norm', tf.global_norm(policy_gradients)),
|
||||
utility.gradient_summaries(zip(value_gradients, value_variables), dict(value=r'.*')),
|
||||
utility.gradient_summaries(zip(policy_gradients, policy_variables), dict(policy=r'.*'))
|
||||
])
|
||||
with tf.control_dependencies([optimize]):
|
||||
return [tf.identity(x) for x in (value_loss, policy_loss, summary)]
|
||||
|
||||
@@ -385,18 +358,17 @@ class PPOAlgorithm(object):
|
||||
"""
|
||||
with tf.name_scope('value_loss'):
|
||||
value = self._network(observ, length).value
|
||||
return_ = utility.discounted_return(
|
||||
reward, length, self._config.discount)
|
||||
return_ = utility.discounted_return(reward, length, self._config.discount)
|
||||
advantage = return_ - value
|
||||
value_loss = 0.5 * self._mask(advantage ** 2, length)
|
||||
value_loss = 0.5 * self._mask(advantage**2, length)
|
||||
summary = tf.summary.merge([
|
||||
tf.summary.histogram('value_loss', value_loss),
|
||||
tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
|
||||
tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))
|
||||
])
|
||||
value_loss = tf.reduce_mean(value_loss)
|
||||
return tf.check_numerics(value_loss, 'value_loss'), summary
|
||||
|
||||
def _policy_loss(
|
||||
self, mean, logstd, old_mean, old_logstd, action, advantage, length):
|
||||
def _policy_loss(self, mean, logstd, old_mean, old_logstd, action, advantage, length):
|
||||
"""Compute the policy loss composed of multiple components.
|
||||
|
||||
1. The policy gradient loss is importance sampled from the data-collecting
|
||||
@@ -420,24 +392,20 @@ class PPOAlgorithm(object):
|
||||
"""
|
||||
with tf.name_scope('policy_loss'):
|
||||
entropy = utility.diag_normal_entropy(mean, logstd)
|
||||
kl = tf.reduce_mean(self._mask(utility.diag_normal_kl(
|
||||
old_mean, old_logstd, mean, logstd), length), 1)
|
||||
kl = tf.reduce_mean(
|
||||
self._mask(utility.diag_normal_kl(old_mean, old_logstd, mean, logstd), length), 1)
|
||||
policy_gradient = tf.exp(
|
||||
utility.diag_normal_logpdf(mean, logstd, action) -
|
||||
utility.diag_normal_logpdf(old_mean, old_logstd, action))
|
||||
surrogate_loss = -tf.reduce_mean(self._mask(
|
||||
policy_gradient * tf.stop_gradient(advantage), length), 1)
|
||||
surrogate_loss = -tf.reduce_mean(
|
||||
self._mask(policy_gradient * tf.stop_gradient(advantage), length), 1)
|
||||
kl_penalty = self._penalty * kl
|
||||
cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
|
||||
cutoff_count = tf.reduce_sum(
|
||||
tf.cast(kl > cutoff_threshold, tf.int32))
|
||||
with tf.control_dependencies([tf.cond(
|
||||
cutoff_count > 0,
|
||||
lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
|
||||
kl_cutoff = (
|
||||
self._config.kl_cutoff_coef *
|
||||
tf.cast(kl > cutoff_threshold, tf.float32) *
|
||||
(kl - cutoff_threshold) ** 2)
|
||||
cutoff_count = tf.reduce_sum(tf.cast(kl > cutoff_threshold, tf.int32))
|
||||
with tf.control_dependencies(
|
||||
[tf.cond(cutoff_count > 0, lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
|
||||
kl_cutoff = (self._config.kl_cutoff_coef * tf.cast(kl > cutoff_threshold, tf.float32) *
|
||||
(kl - cutoff_threshold)**2)
|
||||
policy_loss = surrogate_loss + kl_penalty + kl_cutoff
|
||||
summary = tf.summary.merge([
|
||||
tf.summary.histogram('entropy', entropy),
|
||||
@@ -449,7 +417,8 @@ class PPOAlgorithm(object):
|
||||
tf.summary.histogram('policy_loss', policy_loss),
|
||||
tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
|
||||
tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
|
||||
tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
|
||||
tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))
|
||||
])
|
||||
policy_loss = tf.reduce_mean(policy_loss, 0)
|
||||
return tf.check_numerics(policy_loss, 'policy_loss'), summary
|
||||
|
||||
@@ -471,30 +440,30 @@ class PPOAlgorithm(object):
|
||||
"""
|
||||
with tf.name_scope('adjust_penalty'):
|
||||
network = self._network(observ, length)
|
||||
assert_change = tf.assert_equal(
|
||||
tf.reduce_all(tf.equal(network.mean, old_mean)), False,
|
||||
message='policy should change')
|
||||
assert_change = tf.assert_equal(tf.reduce_all(tf.equal(network.mean, old_mean)),
|
||||
False,
|
||||
message='policy should change')
|
||||
print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
|
||||
with tf.control_dependencies([assert_change, print_penalty]):
|
||||
kl_change = tf.reduce_mean(self._mask(utility.diag_normal_kl(
|
||||
old_mean, old_logstd, network.mean, network.logstd), length))
|
||||
kl_change = tf.reduce_mean(
|
||||
self._mask(utility.diag_normal_kl(old_mean, old_logstd, network.mean, network.logstd),
|
||||
length))
|
||||
kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
|
||||
maybe_increase = tf.cond(
|
||||
kl_change > 1.3 * self._config.kl_target,
|
||||
# pylint: disable=g-long-lambda
|
||||
lambda: tf.Print(self._penalty.assign(
|
||||
self._penalty * 1.5), [0], 'increase penalty '),
|
||||
lambda: tf.Print(self._penalty.assign(self._penalty * 1.5), [0], 'increase penalty '),
|
||||
float)
|
||||
maybe_decrease = tf.cond(
|
||||
kl_change < 0.7 * self._config.kl_target,
|
||||
# pylint: disable=g-long-lambda
|
||||
lambda: tf.Print(self._penalty.assign(
|
||||
self._penalty / 1.5), [0], 'decrease penalty '),
|
||||
lambda: tf.Print(self._penalty.assign(self._penalty / 1.5), [0], 'decrease penalty '),
|
||||
float)
|
||||
with tf.control_dependencies([maybe_increase, maybe_decrease]):
|
||||
return tf.summary.merge([
|
||||
tf.summary.scalar('kl_change', kl_change),
|
||||
tf.summary.scalar('penalty', self._penalty)])
|
||||
tf.summary.scalar('penalty', self._penalty)
|
||||
])
|
||||
|
||||
def _mask(self, tensor, length):
|
||||
"""Set padding elements of a batch of sequences to zero.
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Memory that stores episodes."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -43,10 +42,9 @@ class EpisodeMemory(object):
|
||||
self._scope = var_scope
|
||||
self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
|
||||
self._buffers = [
|
||||
tf.Variable(tf.zeros(
|
||||
[capacity, max_length] + elem.shape.as_list(),
|
||||
elem.dtype), False)
|
||||
for elem in template]
|
||||
tf.Variable(tf.zeros([capacity, max_length] + elem.shape.as_list(), elem.dtype), False)
|
||||
for elem in template
|
||||
]
|
||||
|
||||
def length(self, rows=None):
|
||||
"""Tensor holding the current length of episodes.
|
||||
@@ -72,13 +70,11 @@ class EpisodeMemory(object):
|
||||
"""
|
||||
rows = tf.range(self._capacity) if rows is None else rows
|
||||
assert rows.shape.ndims == 1
|
||||
assert_capacity = tf.assert_less(
|
||||
rows, self._capacity,
|
||||
message='capacity exceeded')
|
||||
assert_capacity = tf.assert_less(rows, self._capacity, message='capacity exceeded')
|
||||
with tf.control_dependencies([assert_capacity]):
|
||||
assert_max_length = tf.assert_less(
|
||||
tf.gather(self._length, rows), self._max_length,
|
||||
message='max length exceeded')
|
||||
assert_max_length = tf.assert_less(tf.gather(self._length, rows),
|
||||
self._max_length,
|
||||
message='max length exceeded')
|
||||
append_ops = []
|
||||
with tf.control_dependencies([assert_max_length]):
|
||||
for buffer_, elements in zip(self._buffers, transitions):
|
||||
@@ -86,8 +82,7 @@ class EpisodeMemory(object):
|
||||
indices = tf.stack([rows, timestep], 1)
|
||||
append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
|
||||
with tf.control_dependencies(append_ops):
|
||||
episode_mask = tf.reduce_sum(tf.one_hot(
|
||||
rows, self._capacity, dtype=tf.int32), 0)
|
||||
episode_mask = tf.reduce_sum(tf.one_hot(rows, self._capacity, dtype=tf.int32), 0)
|
||||
return self._length.assign_add(episode_mask)
|
||||
|
||||
def replace(self, episodes, length, rows=None):
|
||||
@@ -103,11 +98,11 @@ class EpisodeMemory(object):
|
||||
"""
|
||||
rows = tf.range(self._capacity) if rows is None else rows
|
||||
assert rows.shape.ndims == 1
|
||||
assert_capacity = tf.assert_less(
|
||||
rows, self._capacity, message='capacity exceeded')
|
||||
assert_capacity = tf.assert_less(rows, self._capacity, message='capacity exceeded')
|
||||
with tf.control_dependencies([assert_capacity]):
|
||||
assert_max_length = tf.assert_less_equal(
|
||||
length, self._max_length, message='max length exceeded')
|
||||
assert_max_length = tf.assert_less_equal(length,
|
||||
self._max_length,
|
||||
message='max length exceeded')
|
||||
replace_ops = []
|
||||
with tf.control_dependencies([assert_max_length]):
|
||||
for buffer_, elements in zip(self._buffers, episodes):
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Normalize tensors based on streaming estimates of mean and variance."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -24,8 +23,7 @@ import tensorflow as tf
|
||||
class StreamingNormalize(object):
|
||||
"""Normalize tensors based on streaming estimates of mean and variance."""
|
||||
|
||||
def __init__(
|
||||
self, template, center=True, scale=True, clip=10, name='normalize'):
|
||||
def __init__(self, template, center=True, scale=True, clip=10, name='normalize'):
|
||||
"""Normalize tensors based on streaming estimates of mean and variance.
|
||||
|
||||
Centering the value, scaling it by the standard deviation, and clipping
|
||||
@@ -69,8 +67,7 @@ class StreamingNormalize(object):
|
||||
if self._scale:
|
||||
# We cannot scale before seeing at least two samples.
|
||||
value /= tf.cond(
|
||||
self._count > 1, lambda: self._std() + 1e-8,
|
||||
lambda: tf.ones_like(self._var_sum))[None]
|
||||
self._count > 1, lambda: self._std() + 1e-8, lambda: tf.ones_like(self._var_sum))[None]
|
||||
if self._clip:
|
||||
value = tf.clip_by_value(value, -self._clip, self._clip)
|
||||
# Remove batch dimension if necessary.
|
||||
@@ -97,8 +94,7 @@ class StreamingNormalize(object):
|
||||
mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
|
||||
new_mean = self._mean + mean_delta / step
|
||||
new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
|
||||
var_delta = (
|
||||
value - self._mean[None, ...]) * (value - new_mean[None, ...])
|
||||
var_delta = (value - self._mean[None, ...]) * (value - new_mean[None, ...])
|
||||
new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
|
||||
with tf.control_dependencies([new_mean, new_var_sum]):
|
||||
update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
|
||||
@@ -116,10 +112,8 @@ class StreamingNormalize(object):
|
||||
Operation.
|
||||
"""
|
||||
with tf.name_scope(self._name + '/reset'):
|
||||
return tf.group(
|
||||
self._count.assign(0),
|
||||
self._mean.assign(tf.zeros_like(self._mean)),
|
||||
self._var_sum.assign(tf.zeros_like(self._var_sum)))
|
||||
return tf.group(self._count.assign(0), self._mean.assign(tf.zeros_like(self._mean)),
|
||||
self._var_sum.assign(tf.zeros_like(self._var_sum)))
|
||||
|
||||
def summary(self):
|
||||
"""Summary string of mean and standard deviation.
|
||||
@@ -128,10 +122,8 @@ class StreamingNormalize(object):
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope(self._name + '/summary'):
|
||||
mean_summary = tf.cond(
|
||||
self._count > 0, lambda: self._summary('mean', self._mean), str)
|
||||
std_summary = tf.cond(
|
||||
self._count > 1, lambda: self._summary('stddev', self._std()), str)
|
||||
mean_summary = tf.cond(self._count > 0, lambda: self._summary('mean', self._mean), str)
|
||||
std_summary = tf.cond(self._count > 1, lambda: self._summary('stddev', self._std()), str)
|
||||
return tf.summary.merge([mean_summary, std_summary])
|
||||
|
||||
def _std(self):
|
||||
@@ -143,10 +135,8 @@ class StreamingNormalize(object):
|
||||
Returns:
|
||||
Tensor of current variance.
|
||||
"""
|
||||
variance = tf.cond(
|
||||
self._count > 1,
|
||||
lambda: self._var_sum / tf.cast(self._count - 1, tf.float32),
|
||||
lambda: tf.ones_like(self._var_sum) * float('nan'))
|
||||
variance = tf.cond(self._count > 1, lambda: self._var_sum / tf.cast(
|
||||
self._count - 1, tf.float32), lambda: tf.ones_like(self._var_sum) * float('nan'))
|
||||
# The epsilon corrects for small negative variance values caused by
|
||||
# the algorithm. It was empirically chosen to work with all environments
|
||||
# tested.
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Utilities for the PPO algorithm."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -37,8 +36,7 @@ def reinit_nested_vars(variables, indices=None):
|
||||
Operation.
|
||||
"""
|
||||
if isinstance(variables, (tuple, list)):
|
||||
return tf.group(*[
|
||||
reinit_nested_vars(variable, indices) for variable in variables])
|
||||
return tf.group(*[reinit_nested_vars(variable, indices) for variable in variables])
|
||||
if indices is None:
|
||||
return variables.assign(tf.zeros_like(variables))
|
||||
else:
|
||||
@@ -58,9 +56,8 @@ def assign_nested_vars(variables, tensors, indices=None):
|
||||
Operation.
|
||||
"""
|
||||
if isinstance(variables, (tuple, list)):
|
||||
return tf.group(*[
|
||||
assign_nested_vars(variable, tensor)
|
||||
for variable, tensor in zip(variables, tensors)])
|
||||
return tf.group(
|
||||
*[assign_nested_vars(variable, tensor) for variable, tensor in zip(variables, tensors)])
|
||||
if indices is None:
|
||||
return variables.assign(tensors)
|
||||
else:
|
||||
@@ -71,10 +68,11 @@ def discounted_return(reward, length, discount):
|
||||
"""Discounted Monte-Carlo returns."""
|
||||
timestep = tf.range(reward.shape[1].value)
|
||||
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
|
||||
return_ = tf.reverse(tf.transpose(tf.scan(
|
||||
lambda agg, cur: cur + discount * agg,
|
||||
tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
|
||||
tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
|
||||
return_ = tf.reverse(
|
||||
tf.transpose(
|
||||
tf.scan(lambda agg, cur: cur + discount * agg,
|
||||
tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
|
||||
tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
|
||||
return tf.check_numerics(tf.stop_gradient(return_), 'return')
|
||||
|
||||
|
||||
@@ -85,9 +83,8 @@ def fixed_step_return(reward, value, length, discount, window):
|
||||
return_ = tf.zeros_like(reward)
|
||||
for _ in range(window):
|
||||
return_ += reward
|
||||
reward = discount * tf.concat(
|
||||
[reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
|
||||
return_ += discount ** window * tf.concat(
|
||||
reward = discount * tf.concat([reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
|
||||
return_ += discount**window * tf.concat(
|
||||
[value[:, window:], tf.zeros_like(value[:, -window:]), 1])
|
||||
return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
|
||||
|
||||
@@ -99,10 +96,11 @@ def lambda_return(reward, value, length, discount, lambda_):
|
||||
sequence = mask * reward + discount * value * (1 - lambda_)
|
||||
discount = mask * discount * lambda_
|
||||
sequence = tf.stack([sequence, discount], 2)
|
||||
return_ = tf.reverse(tf.transpose(tf.scan(
|
||||
lambda agg, cur: cur[0] + cur[1] * agg,
|
||||
tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
|
||||
tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
|
||||
return_ = tf.reverse(
|
||||
tf.transpose(
|
||||
tf.scan(lambda agg, cur: cur[0] + cur[1] * agg,
|
||||
tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]), tf.zeros_like(value[:, -1]),
|
||||
1, False), [1, 0]), [1])
|
||||
return tf.check_numerics(tf.stop_gradient(return_), 'return')
|
||||
|
||||
|
||||
@@ -112,27 +110,26 @@ def lambda_advantage(reward, value, length, discount):
|
||||
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
|
||||
next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
|
||||
delta = reward + discount * next_value - value
|
||||
advantage = tf.reverse(tf.transpose(tf.scan(
|
||||
lambda agg, cur: cur + discount * agg,
|
||||
tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
|
||||
tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
|
||||
advantage = tf.reverse(
|
||||
tf.transpose(
|
||||
tf.scan(lambda agg, cur: cur + discount * agg,
|
||||
tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]), tf.zeros_like(delta[:, -1]),
|
||||
1, False), [1, 0]), [1])
|
||||
return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
|
||||
|
||||
|
||||
def diag_normal_kl(mean0, logstd0, mean1, logstd1):
|
||||
"""Epirical KL divergence of two normals with diagonal covariance."""
|
||||
logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
|
||||
return 0.5 * (
|
||||
tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
|
||||
tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) +
|
||||
tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
|
||||
mean0.shape[-1].value)
|
||||
return 0.5 * (tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) + tf.reduce_sum(
|
||||
(mean1 - mean0)**2 / tf.exp(logstd1_2), -1) + tf.reduce_sum(logstd1_2, -1) -
|
||||
tf.reduce_sum(logstd0_2, -1) - mean0.shape[-1].value)
|
||||
|
||||
|
||||
def diag_normal_logpdf(mean, logstd, loc):
|
||||
"""Log density of a normal with diagonal covariance."""
|
||||
constant = -0.5 * math.log(2 * math.pi) - logstd
|
||||
value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2
|
||||
value = -0.5 * ((loc - mean) / tf.exp(logstd))**2
|
||||
return tf.reduce_sum(constant + value, -1)
|
||||
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tools for reinforcement learning."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Wrap a dictionary to access keys as attributes."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Combine multiple environments to step them in batch."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -83,13 +82,9 @@ class BatchEnv(object):
|
||||
message = 'Invalid action at index {}: {}'
|
||||
raise ValueError(message.format(index, action))
|
||||
if self._blocking:
|
||||
transitions = [
|
||||
env.step(action)
|
||||
for env, action in zip(self._envs, actions)]
|
||||
transitions = [env.step(action) for env, action in zip(self._envs, actions)]
|
||||
else:
|
||||
transitions = [
|
||||
env.step(action, blocking=False)
|
||||
for env, action in zip(self._envs, actions)]
|
||||
transitions = [env.step(action, blocking=False) for env, action in zip(self._envs, actions)]
|
||||
transitions = [transition() for transition in transitions]
|
||||
observs, rewards, dones, infos = zip(*transitions)
|
||||
observ = np.stack(observs)
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Count learnable parameters."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Batch of environments inside the TensorFlow graph."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -42,18 +41,18 @@ class InGraphBatchEnv(object):
|
||||
action_shape = self._parse_shape(self._batch_env.action_space)
|
||||
action_dtype = self._parse_dtype(self._batch_env.action_space)
|
||||
with tf.variable_scope('env_temporary'):
|
||||
self._observ = tf.Variable(
|
||||
tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
|
||||
name='observ', trainable=False)
|
||||
self._action = tf.Variable(
|
||||
tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
|
||||
name='action', trainable=False)
|
||||
self._reward = tf.Variable(
|
||||
tf.zeros((len(self._batch_env),), tf.float32),
|
||||
name='reward', trainable=False)
|
||||
self._done = tf.Variable(
|
||||
tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
|
||||
name='done', trainable=False)
|
||||
self._observ = tf.Variable(tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
|
||||
name='observ',
|
||||
trainable=False)
|
||||
self._action = tf.Variable(tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
|
||||
name='action',
|
||||
trainable=False)
|
||||
self._reward = tf.Variable(tf.zeros((len(self._batch_env),), tf.float32),
|
||||
name='reward',
|
||||
trainable=False)
|
||||
self._done = tf.Variable(tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
|
||||
name='done',
|
||||
trainable=False)
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Forward unimplemented attributes to one of the original environments.
|
||||
@@ -89,16 +88,13 @@ class InGraphBatchEnv(object):
|
||||
if action.dtype in (tf.float16, tf.float32, tf.float64):
|
||||
action = tf.check_numerics(action, 'action')
|
||||
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
|
||||
observ, reward, done = tf.py_func(
|
||||
lambda a: self._batch_env.step(a)[:3], [action],
|
||||
[observ_dtype, tf.float32, tf.bool], name='step')
|
||||
observ, reward, done = tf.py_func(lambda a: self._batch_env.step(a)[:3], [action],
|
||||
[observ_dtype, tf.float32, tf.bool],
|
||||
name='step')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
reward = tf.check_numerics(reward, 'reward')
|
||||
return tf.group(
|
||||
self._observ.assign(observ),
|
||||
self._action.assign(action),
|
||||
self._reward.assign(reward),
|
||||
self._done.assign(done))
|
||||
return tf.group(self._observ.assign(observ), self._action.assign(action),
|
||||
self._reward.assign(reward), self._done.assign(done))
|
||||
|
||||
def reset(self, indices=None):
|
||||
"""Reset the batch of environments.
|
||||
@@ -112,15 +108,15 @@ class InGraphBatchEnv(object):
|
||||
if indices is None:
|
||||
indices = tf.range(len(self._batch_env))
|
||||
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
|
||||
observ = tf.py_func(
|
||||
self._batch_env.reset, [indices], observ_dtype, name='reset')
|
||||
observ = tf.py_func(self._batch_env.reset, [indices], observ_dtype, name='reset')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
reward = tf.zeros_like(indices, tf.float32)
|
||||
done = tf.zeros_like(indices, tf.bool)
|
||||
with tf.control_dependencies([
|
||||
tf.scatter_update(self._observ, indices, observ),
|
||||
tf.scatter_update(self._reward, indices, reward),
|
||||
tf.scatter_update(self._done, indices, done)]):
|
||||
tf.scatter_update(self._done, indices, done)
|
||||
]):
|
||||
return tf.identity(observ)
|
||||
|
||||
@property
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Put an OpenAI Gym environment into the TensorFlow graph."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -42,16 +41,15 @@ class InGraphEnv(object):
|
||||
action_shape = self._parse_shape(self._env.action_space)
|
||||
action_dtype = self._parse_dtype(self._env.action_space)
|
||||
with tf.name_scope('environment'):
|
||||
self._observ = tf.Variable(
|
||||
tf.zeros(observ_shape, observ_dtype), name='observ', trainable=False)
|
||||
self._action = tf.Variable(
|
||||
tf.zeros(action_shape, action_dtype), name='action', trainable=False)
|
||||
self._reward = tf.Variable(
|
||||
0.0, dtype=tf.float32, name='reward', trainable=False)
|
||||
self._done = tf.Variable(
|
||||
True, dtype=tf.bool, name='done', trainable=False)
|
||||
self._step = tf.Variable(
|
||||
0, dtype=tf.int32, name='step', trainable=False)
|
||||
self._observ = tf.Variable(tf.zeros(observ_shape, observ_dtype),
|
||||
name='observ',
|
||||
trainable=False)
|
||||
self._action = tf.Variable(tf.zeros(action_shape, action_dtype),
|
||||
name='action',
|
||||
trainable=False)
|
||||
self._reward = tf.Variable(0.0, dtype=tf.float32, name='reward', trainable=False)
|
||||
self._done = tf.Variable(True, dtype=tf.bool, name='done', trainable=False)
|
||||
self._step = tf.Variable(0, dtype=tf.int32, name='step', trainable=False)
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Forward unimplemented attributes to the original environment.
|
||||
@@ -79,17 +77,14 @@ class InGraphEnv(object):
|
||||
if action.dtype in (tf.float16, tf.float32, tf.float64):
|
||||
action = tf.check_numerics(action, 'action')
|
||||
observ_dtype = self._parse_dtype(self._env.observation_space)
|
||||
observ, reward, done = tf.py_func(
|
||||
lambda a: self._env.step(a)[:3], [action],
|
||||
[observ_dtype, tf.float32, tf.bool], name='step')
|
||||
observ, reward, done = tf.py_func(lambda a: self._env.step(a)[:3], [action],
|
||||
[observ_dtype, tf.float32, tf.bool],
|
||||
name='step')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
reward = tf.check_numerics(reward, 'reward')
|
||||
return tf.group(
|
||||
self._observ.assign(observ),
|
||||
self._action.assign(action),
|
||||
self._reward.assign(reward),
|
||||
self._done.assign(done),
|
||||
self._step.assign_add(1))
|
||||
return tf.group(self._observ.assign(observ), self._action.assign(action),
|
||||
self._reward.assign(reward), self._done.assign(done),
|
||||
self._step.assign_add(1))
|
||||
|
||||
def reset(self):
|
||||
"""Reset the environment.
|
||||
@@ -100,10 +95,10 @@ class InGraphEnv(object):
|
||||
observ_dtype = self._parse_dtype(self._env.observation_space)
|
||||
observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
with tf.control_dependencies([
|
||||
self._observ.assign(observ),
|
||||
self._reward.assign(0),
|
||||
self._done.assign(False)]):
|
||||
with tf.control_dependencies(
|
||||
[self._observ.assign(observ),
|
||||
self._reward.assign(0),
|
||||
self._done.assign(False)]):
|
||||
return tf.identity(observ)
|
||||
|
||||
@property
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Execute operations in a loop and coordinate logging and checkpoints."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -25,10 +24,8 @@ import tensorflow as tf
|
||||
|
||||
from . import streaming_mean
|
||||
|
||||
|
||||
_Phase = collections.namedtuple(
|
||||
'Phase',
|
||||
'name, writer, op, batch, steps, feed, report_every, log_every,'
|
||||
'Phase', 'name, writer, op, batch, steps, feed, report_every, log_every,'
|
||||
'checkpoint_every')
|
||||
|
||||
|
||||
@@ -56,16 +53,22 @@ class Loop(object):
|
||||
reset: Tensor indicating to the model to start a new computation.
|
||||
"""
|
||||
self._logdir = logdir
|
||||
self._step = (
|
||||
tf.Variable(0, False, name='global_step') if step is None else step)
|
||||
self._step = (tf.Variable(0, False, name='global_step') if step is None else step)
|
||||
self._log = tf.placeholder(tf.bool) if log is None else log
|
||||
self._report = tf.placeholder(tf.bool) if report is None else report
|
||||
self._reset = tf.placeholder(tf.bool) if reset is None else reset
|
||||
self._phases = []
|
||||
|
||||
def add_phase(
|
||||
self, name, done, score, summary, steps,
|
||||
report_every=None, log_every=None, checkpoint_every=None, feed=None):
|
||||
def add_phase(self,
|
||||
name,
|
||||
done,
|
||||
score,
|
||||
summary,
|
||||
steps,
|
||||
report_every=None,
|
||||
log_every=None,
|
||||
checkpoint_every=None,
|
||||
feed=None):
|
||||
"""Add a phase to the loop protocol.
|
||||
|
||||
If the model breaks long computation into multiple steps, the done tensor
|
||||
@@ -97,13 +100,12 @@ class Loop(object):
|
||||
if done.shape.ndims is None or score.shape.ndims is None:
|
||||
raise ValueError("Rank of 'done' and 'score' tensors must be known.")
|
||||
writer = self._logdir and tf.summary.FileWriter(
|
||||
os.path.join(self._logdir, name), tf.get_default_graph(),
|
||||
flush_secs=60)
|
||||
os.path.join(self._logdir, name), tf.get_default_graph(), flush_secs=60)
|
||||
op = self._define_step(done, score, summary)
|
||||
batch = 1 if score.shape.ndims == 0 else score.shape[0].value
|
||||
self._phases.append(_Phase(
|
||||
name, writer, op, batch, int(steps), feed, report_every,
|
||||
log_every, checkpoint_every))
|
||||
self._phases.append(
|
||||
_Phase(name, writer, op, batch, int(steps), feed, report_every, log_every,
|
||||
checkpoint_every))
|
||||
|
||||
def run(self, sess, saver, max_step=None):
|
||||
"""Run the loop schedule for a specified number of steps.
|
||||
@@ -133,13 +135,11 @@ class Loop(object):
|
||||
tf.logging.info(message.format(phase.name, phase_step, global_step))
|
||||
# Populate book keeping tensors.
|
||||
phase.feed[self._reset] = (steps_in < steps_made)
|
||||
phase.feed[self._log] = (
|
||||
phase.writer and
|
||||
self._is_every_steps(phase_step, phase.batch, phase.log_every))
|
||||
phase.feed[self._report] = (
|
||||
self._is_every_steps(phase_step, phase.batch, phase.report_every))
|
||||
summary, mean_score, global_step, steps_made = sess.run(
|
||||
phase.op, phase.feed)
|
||||
phase.feed[self._log] = (phase.writer and
|
||||
self._is_every_steps(phase_step, phase.batch, phase.log_every))
|
||||
phase.feed[self._report] = (self._is_every_steps(phase_step, phase.batch,
|
||||
phase.report_every))
|
||||
summary, mean_score, global_step, steps_made = sess.run(phase.op, phase.feed)
|
||||
if self._is_every_steps(phase_step, phase.batch, phase.checkpoint_every):
|
||||
self._store_checkpoint(sess, saver, global_step)
|
||||
if self._is_every_steps(phase_step, phase.batch, phase.report_every):
|
||||
@@ -207,8 +207,7 @@ class Loop(object):
|
||||
score_mean = streaming_mean.StreamingMean((), tf.float32)
|
||||
with tf.control_dependencies([done, score, summary]):
|
||||
done_score = tf.gather(score, tf.where(done)[:, 0])
|
||||
submit_score = tf.cond(
|
||||
tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
|
||||
submit_score = tf.cond(tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
|
||||
with tf.control_dependencies([submit_score]):
|
||||
mean_score = tf.cond(self._report, score_mean.clear, float)
|
||||
steps_made = tf.shape(score)[0]
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Mock algorithm for testing reinforcement learning code."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Mock environment for testing reinforcement learning code."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""In-graph simulation step of a vectorized algorithm with environments."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -55,7 +54,8 @@ def simulate(batch_env, algo, log=True, reset=False):
|
||||
reset_ops = [
|
||||
batch_env.reset(agent_indices),
|
||||
tf.scatter_update(score, agent_indices, zero_scores),
|
||||
tf.scatter_update(length, agent_indices, zero_durations)]
|
||||
tf.scatter_update(length, agent_indices, zero_durations)
|
||||
]
|
||||
with tf.control_dependencies(reset_ops):
|
||||
return algo.begin_episode(agent_indices)
|
||||
|
||||
@@ -78,9 +78,8 @@ def simulate(batch_env, algo, log=True, reset=False):
|
||||
inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
|
||||
with tf.control_dependencies([add_score, inc_length]):
|
||||
agent_indices = tf.range(len(batch_env))
|
||||
experience_summary = algo.experience(
|
||||
agent_indices, prevob, batch_env.action, batch_env.reward,
|
||||
batch_env.done, batch_env.observ)
|
||||
experience_summary = algo.experience(agent_indices, prevob, batch_env.action,
|
||||
batch_env.reward, batch_env.done, batch_env.observ)
|
||||
return tf.summary.merge([step_summary, experience_summary])
|
||||
|
||||
def _define_end_episode(agent_indices):
|
||||
@@ -96,8 +95,7 @@ def simulate(batch_env, algo, log=True, reset=False):
|
||||
"""
|
||||
assert agent_indices.shape.ndims == 1
|
||||
submit_score = mean_score.submit(tf.gather(score, agent_indices))
|
||||
submit_length = mean_length.submit(
|
||||
tf.cast(tf.gather(length, agent_indices), tf.float32))
|
||||
submit_length = mean_length.submit(tf.cast(tf.gather(length, agent_indices), tf.float32))
|
||||
with tf.control_dependencies([submit_score, submit_length]):
|
||||
return algo.end_episode(agent_indices)
|
||||
|
||||
@@ -107,41 +105,34 @@ def simulate(batch_env, algo, log=True, reset=False):
|
||||
Returns:
|
||||
Summary string.
|
||||
"""
|
||||
score_summary = tf.cond(
|
||||
tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
|
||||
lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
|
||||
length_summary = tf.cond(
|
||||
tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
|
||||
lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
|
||||
score_summary = tf.cond(tf.logical_and(log, tf.cast(
|
||||
mean_score.count, tf.bool)), lambda: tf.summary.scalar('mean_score', mean_score.clear()),
|
||||
str)
|
||||
length_summary = tf.cond(tf.logical_and(
|
||||
log, tf.cast(mean_length.count,
|
||||
tf.bool)), lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
|
||||
return tf.summary.merge([score_summary, length_summary])
|
||||
|
||||
with tf.name_scope('simulate'):
|
||||
log = tf.convert_to_tensor(log)
|
||||
reset = tf.convert_to_tensor(reset)
|
||||
with tf.variable_scope('simulate_temporary'):
|
||||
score = tf.Variable(
|
||||
tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
|
||||
length = tf.Variable(
|
||||
tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
|
||||
score = tf.Variable(tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
|
||||
length = tf.Variable(tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
|
||||
mean_score = streaming_mean.StreamingMean((), tf.float32)
|
||||
mean_length = streaming_mean.StreamingMean((), tf.float32)
|
||||
agent_indices = tf.cond(
|
||||
reset,
|
||||
lambda: tf.range(len(batch_env)),
|
||||
lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
|
||||
begin_episode = tf.cond(
|
||||
tf.cast(tf.shape(agent_indices)[0], tf.bool),
|
||||
lambda: _define_begin_episode(agent_indices), str)
|
||||
agent_indices = tf.cond(reset, lambda: tf.range(len(batch_env)), lambda: tf.cast(
|
||||
tf.where(batch_env.done)[:, 0], tf.int32))
|
||||
begin_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0],
|
||||
tf.bool), lambda: _define_begin_episode(agent_indices), str)
|
||||
with tf.control_dependencies([begin_episode]):
|
||||
step = _define_step()
|
||||
with tf.control_dependencies([step]):
|
||||
agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
|
||||
end_episode = tf.cond(
|
||||
tf.cast(tf.shape(agent_indices)[0], tf.bool),
|
||||
lambda: _define_end_episode(agent_indices), str)
|
||||
end_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0],
|
||||
tf.bool), lambda: _define_end_episode(agent_indices), str)
|
||||
with tf.control_dependencies([end_episode]):
|
||||
summary = tf.summary.merge([
|
||||
_define_summaries(), begin_episode, step, end_episode])
|
||||
summary = tf.summary.merge([_define_summaries(), begin_episode, step, end_episode])
|
||||
with tf.control_dependencies([summary]):
|
||||
done, score = tf.identity(batch_env.done), tf.identity(score)
|
||||
return done, score, summary
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Compute a streaming estimation of the mean of submitted tensors."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -53,9 +52,8 @@ class StreamingMean(object):
|
||||
# Add a batch dimension if necessary.
|
||||
if value.shape.ndims == self._sum.shape.ndims:
|
||||
value = value[None, ...]
|
||||
return tf.group(
|
||||
self._sum.assign_add(tf.reduce_sum(value, 0)),
|
||||
self._count.assign_add(tf.shape(value)[0]))
|
||||
return tf.group(self._sum.assign_add(tf.reduce_sum(value, 0)),
|
||||
self._count.assign_add(tf.shape(value)[0]))
|
||||
|
||||
def clear(self):
|
||||
"""Return the mean estimate and reset the streaming statistics."""
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Wrappers for OpenAI Gym environments."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -149,8 +148,7 @@ class FrameHistory(object):
|
||||
return self._select_frames()
|
||||
|
||||
def _select_frames(self):
|
||||
indices = [
|
||||
(self._step - index) % self._capacity for index in self._past_indices]
|
||||
indices = [(self._step - index) % self._capacity for index in self._past_indices]
|
||||
observ = self._buffer[indices]
|
||||
if self._flatten:
|
||||
observ = np.reshape(observ, (-1,) + observ.shape[2:])
|
||||
@@ -191,14 +189,14 @@ class RangeNormalize(object):
|
||||
|
||||
def __init__(self, env, observ=None, action=None):
|
||||
self._env = env
|
||||
self._should_normalize_observ = (
|
||||
observ is not False and self._is_finite(self._env.observation_space))
|
||||
self._should_normalize_observ = (observ is not False and
|
||||
self._is_finite(self._env.observation_space))
|
||||
if observ is True and not self._should_normalize_observ:
|
||||
raise ValueError('Cannot normalize infinite observation range.')
|
||||
if observ is None and not self._should_normalize_observ:
|
||||
tf.logging.info('Not normalizing infinite observation range.')
|
||||
self._should_normalize_action = (
|
||||
action is not False and self._is_finite(self._env.action_space))
|
||||
self._should_normalize_action = (action is not False and
|
||||
self._is_finite(self._env.action_space))
|
||||
if action is True and not self._should_normalize_action:
|
||||
raise ValueError('Cannot normalize infinite action range.')
|
||||
if action is None and not self._should_normalize_action:
|
||||
@@ -323,8 +321,7 @@ class ExternalProcess(object):
|
||||
action_space: The cached action space of the environment.
|
||||
"""
|
||||
self._conn, conn = multiprocessing.Pipe()
|
||||
self._process = multiprocessing.Process(
|
||||
target=self._worker, args=(constructor, conn))
|
||||
self._process = multiprocessing.Process(target=self._worker, args=(constructor, conn))
|
||||
atexit.register(self.close)
|
||||
self._process.start()
|
||||
self._observ_space = None
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
r"""Script to train a batch reinforcement learning algorithm.
|
||||
|
||||
Command line:
|
||||
@@ -67,21 +66,25 @@ def _define_loop(graph, logdir, train_steps, eval_steps):
|
||||
Returns:
|
||||
Loop object.
|
||||
"""
|
||||
loop = tools.Loop(
|
||||
logdir, graph.step, graph.should_log, graph.do_report,
|
||||
graph.force_reset)
|
||||
loop.add_phase(
|
||||
'train', graph.done, graph.score, graph.summary, train_steps,
|
||||
report_every=train_steps,
|
||||
log_every=train_steps // 2,
|
||||
checkpoint_every=None,
|
||||
feed={graph.is_training: True})
|
||||
loop.add_phase(
|
||||
'eval', graph.done, graph.score, graph.summary, eval_steps,
|
||||
report_every=eval_steps,
|
||||
log_every=eval_steps // 2,
|
||||
checkpoint_every=10 * eval_steps,
|
||||
feed={graph.is_training: False})
|
||||
loop = tools.Loop(logdir, graph.step, graph.should_log, graph.do_report, graph.force_reset)
|
||||
loop.add_phase('train',
|
||||
graph.done,
|
||||
graph.score,
|
||||
graph.summary,
|
||||
train_steps,
|
||||
report_every=train_steps,
|
||||
log_every=train_steps // 2,
|
||||
checkpoint_every=None,
|
||||
feed={graph.is_training: True})
|
||||
loop.add_phase('eval',
|
||||
graph.done,
|
||||
graph.score,
|
||||
graph.summary,
|
||||
eval_steps,
|
||||
report_every=eval_steps,
|
||||
log_every=eval_steps // 2,
|
||||
checkpoint_every=10 * eval_steps,
|
||||
feed={graph.is_training: False})
|
||||
return loop
|
||||
|
||||
|
||||
@@ -102,18 +105,13 @@ def train(config, env_processes):
|
||||
if config.update_every % config.num_agents:
|
||||
tf.logging.warn('Number of agents should divide episodes per update.')
|
||||
with tf.device('/cpu:0'):
|
||||
batch_env = utility.define_batch_env(
|
||||
lambda: _create_environment(config),
|
||||
config.num_agents, env_processes)
|
||||
graph = utility.define_simulation_graph(
|
||||
batch_env, config.algorithm, config)
|
||||
loop = _define_loop(
|
||||
graph, config.logdir,
|
||||
config.update_every * config.max_length,
|
||||
config.eval_episodes * config.max_length)
|
||||
total_steps = int(
|
||||
config.steps / config.update_every *
|
||||
(config.update_every + config.eval_episodes))
|
||||
batch_env = utility.define_batch_env(lambda: _create_environment(config), config.num_agents,
|
||||
env_processes)
|
||||
graph = utility.define_simulation_graph(batch_env, config.algorithm, config)
|
||||
loop = _define_loop(graph, config.logdir, config.update_every * config.max_length,
|
||||
config.eval_episodes * config.max_length)
|
||||
total_steps = int(config.steps / config.update_every *
|
||||
(config.update_every + config.eval_episodes))
|
||||
# Exclude episode related variables since the Python state of environments is
|
||||
# not checkpointed and thus new episodes start after resuming.
|
||||
saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
|
||||
@@ -131,8 +129,8 @@ def main(_):
|
||||
utility.set_up_logging()
|
||||
if not FLAGS.config:
|
||||
raise KeyError('You must specify a configuration.')
|
||||
logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
|
||||
FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
|
||||
logdir = FLAGS.logdir and os.path.expanduser(
|
||||
os.path.join(FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
|
||||
try:
|
||||
config = utility.load_config(logdir)
|
||||
except IOError:
|
||||
@@ -144,16 +142,11 @@ def main(_):
|
||||
|
||||
if __name__ == '__main__':
|
||||
FLAGS = tf.app.flags.FLAGS
|
||||
tf.app.flags.DEFINE_string(
|
||||
'logdir', None,
|
||||
'Base directory to store logs.')
|
||||
tf.app.flags.DEFINE_string(
|
||||
'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
|
||||
'Sub directory to store logs.')
|
||||
tf.app.flags.DEFINE_string(
|
||||
'config', None,
|
||||
'Configuration to execute.')
|
||||
tf.app.flags.DEFINE_boolean(
|
||||
'env_processes', True,
|
||||
'Step environments in separate processes to circumvent the GIL.')
|
||||
tf.app.flags.DEFINE_string('logdir', None, 'Base directory to store logs.')
|
||||
tf.app.flags.DEFINE_string('timestamp',
|
||||
datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
|
||||
'Sub directory to store logs.')
|
||||
tf.app.flags.DEFINE_string('config', None, 'Configuration to execute.')
|
||||
tf.app.flags.DEFINE_boolean('env_processes', True,
|
||||
'Step environments in separate processes to circumvent the GIL.')
|
||||
tf.app.run()
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Utilities for using reinforcement learning algorithms."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
@@ -46,8 +45,7 @@ def define_simulation_graph(batch_env, algo_cls, config):
|
||||
do_report = tf.placeholder(tf.bool, name='do_report')
|
||||
force_reset = tf.placeholder(tf.bool, name='force_reset')
|
||||
algo = algo_cls(batch_env, step, is_training, should_log, config)
|
||||
done, score, summary = tools.simulate(
|
||||
batch_env, algo, should_log, force_reset)
|
||||
done, score, summary = tools.simulate(batch_env, algo, should_log, force_reset)
|
||||
message = 'Graph contains {} trainable variables.'
|
||||
tf.logging.info(message.format(tools.count_weights()))
|
||||
# pylint: enable=unused-variable
|
||||
@@ -67,9 +65,7 @@ def define_batch_env(constructor, num_agents, env_processes):
|
||||
"""
|
||||
with tf.variable_scope('environments'):
|
||||
if env_processes:
|
||||
envs = [
|
||||
tools.wrappers.ExternalProcess(constructor)
|
||||
for _ in range(num_agents)]
|
||||
envs = [tools.wrappers.ExternalProcess(constructor) for _ in range(num_agents)]
|
||||
else:
|
||||
envs = [constructor() for _ in range(num_agents)]
|
||||
batch_env = tools.BatchEnv(envs, blocking=not env_processes)
|
||||
@@ -111,9 +107,7 @@ def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
|
||||
ValueError: If resume expected but no log directory specified.
|
||||
RuntimeError: If no resume expected but a checkpoint was found.
|
||||
"""
|
||||
sess.run(tf.group(
|
||||
tf.local_variables_initializer(),
|
||||
tf.global_variables_initializer()))
|
||||
sess.run(tf.group(tf.local_variables_initializer(), tf.global_variables_initializer()))
|
||||
if resume and not (logdir or checkpoint):
|
||||
raise ValueError('Need to specify logdir to resume a checkpoint.')
|
||||
if logdir:
|
||||
@@ -152,9 +146,8 @@ def save_config(config, logdir=None):
|
||||
with tf.gfile.GFile(config_path, 'w') as file_:
|
||||
yaml.dump(config, file_, default_flow_style=False)
|
||||
else:
|
||||
message = (
|
||||
'Start a new run without storing summaries and checkpoints since no '
|
||||
'logging directory was specified.')
|
||||
message = ('Start a new run without storing summaries and checkpoints since no '
|
||||
'logging directory was specified.')
|
||||
tf.logging.info(message)
|
||||
return config
|
||||
|
||||
@@ -173,9 +166,8 @@ def load_config(logdir):
|
||||
"""
|
||||
config_path = logdir and os.path.join(logdir, 'config.yaml')
|
||||
if not config_path or not tf.gfile.Exists(config_path):
|
||||
message = (
|
||||
'Cannot resume an existing run since the logging directory does not '
|
||||
'contain a configuration file.')
|
||||
message = ('Cannot resume an existing run since the logging directory does not '
|
||||
'contain a configuration file.')
|
||||
raise IOError(message)
|
||||
with tf.gfile.FastGFile(config_path, 'r') as file_:
|
||||
config = yaml.load(file_, Loader=yaml.Loader)
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
r"""Script to render videos of the Proximal Policy Gradient algorithm.
|
||||
|
||||
Command line:
|
||||
@@ -53,8 +52,7 @@ def _create_environment(config, outdir):
|
||||
setattr(env, 'spec', getattr(env, 'spec', None))
|
||||
if config.max_length:
|
||||
env = tools.wrappers.LimitDuration(env, config.max_length)
|
||||
env = gym.wrappers.Monitor(
|
||||
env, outdir, lambda unused_episode_number: True)
|
||||
env = gym.wrappers.Monitor(env, outdir, lambda unused_episode_number: True)
|
||||
env = tools.wrappers.RangeNormalize(env)
|
||||
env = tools.wrappers.ClipAction(env)
|
||||
env = tools.wrappers.ConvertTo32Bit(env)
|
||||
@@ -71,20 +69,20 @@ def _define_loop(graph, eval_steps):
|
||||
Returns:
|
||||
Loop object.
|
||||
"""
|
||||
loop = tools.Loop(
|
||||
None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
|
||||
loop.add_phase(
|
||||
'eval', graph.done, graph.score, graph.summary, eval_steps,
|
||||
report_every=eval_steps,
|
||||
log_every=None,
|
||||
checkpoint_every=None,
|
||||
feed={graph.is_training: False})
|
||||
loop = tools.Loop(None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
|
||||
loop.add_phase('eval',
|
||||
graph.done,
|
||||
graph.score,
|
||||
graph.summary,
|
||||
eval_steps,
|
||||
report_every=eval_steps,
|
||||
log_every=None,
|
||||
checkpoint_every=None,
|
||||
feed={graph.is_training: False})
|
||||
return loop
|
||||
|
||||
|
||||
def visualize(
|
||||
logdir, outdir, num_agents, num_episodes, checkpoint=None,
|
||||
env_processes=True):
|
||||
def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None, env_processes=True):
|
||||
"""Recover checkpoint and render videos from it.
|
||||
|
||||
Args:
|
||||
@@ -97,20 +95,16 @@ def visualize(
|
||||
"""
|
||||
config = utility.load_config(logdir)
|
||||
with tf.device('/cpu:0'):
|
||||
batch_env = utility.define_batch_env(
|
||||
lambda: _create_environment(config, outdir),
|
||||
num_agents, env_processes)
|
||||
graph = utility.define_simulation_graph(
|
||||
batch_env, config.algorithm, config)
|
||||
batch_env = utility.define_batch_env(lambda: _create_environment(config, outdir), num_agents,
|
||||
env_processes)
|
||||
graph = utility.define_simulation_graph(batch_env, config.algorithm, config)
|
||||
total_steps = num_episodes * config.max_length
|
||||
loop = _define_loop(graph, total_steps)
|
||||
saver = utility.define_saver(
|
||||
exclude=(r'.*_temporary/.*', r'global_step'))
|
||||
saver = utility.define_saver(exclude=(r'.*_temporary/.*', r'global_step'))
|
||||
sess_config = tf.ConfigProto(allow_soft_placement=True)
|
||||
sess_config.gpu_options.allow_growth = True
|
||||
with tf.Session(config=sess_config) as sess:
|
||||
utility.initialize_variables(
|
||||
sess, saver, config.logdir, checkpoint, resume=True)
|
||||
utility.initialize_variables(sess, saver, config.logdir, checkpoint, resume=True)
|
||||
for unused_score in loop.run(sess, saver, total_steps):
|
||||
pass
|
||||
batch_env.close()
|
||||
@@ -123,29 +117,18 @@ def main(_):
|
||||
raise KeyError('You must specify logging and outdirs directories.')
|
||||
FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
|
||||
FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
|
||||
visualize(
|
||||
FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
|
||||
FLAGS.checkpoint, FLAGS.env_processes)
|
||||
visualize(FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes, FLAGS.checkpoint,
|
||||
FLAGS.env_processes)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
FLAGS = tf.app.flags.FLAGS
|
||||
tf.app.flags.DEFINE_string(
|
||||
'logdir', None,
|
||||
'Directory to the checkpoint of a training run.')
|
||||
tf.app.flags.DEFINE_string(
|
||||
'outdir', None,
|
||||
'Local directory for storing the monitoring outdir.')
|
||||
tf.app.flags.DEFINE_string(
|
||||
'checkpoint', None,
|
||||
'Checkpoint name to load; defaults to most recent.')
|
||||
tf.app.flags.DEFINE_integer(
|
||||
'num_agents', 1,
|
||||
'How many environments to step in parallel.')
|
||||
tf.app.flags.DEFINE_integer(
|
||||
'num_episodes', 5,
|
||||
'Minimum number of episodes to render.')
|
||||
tf.app.flags.DEFINE_boolean(
|
||||
'env_processes', True,
|
||||
'Step environments in separate processes to circumvent the GIL.')
|
||||
tf.app.flags.DEFINE_string('logdir', None, 'Directory to the checkpoint of a training run.')
|
||||
tf.app.flags.DEFINE_string('outdir', None, 'Local directory for storing the monitoring outdir.')
|
||||
tf.app.flags.DEFINE_string('checkpoint', None,
|
||||
'Checkpoint name to load; defaults to most recent.')
|
||||
tf.app.flags.DEFINE_integer('num_agents', 1, 'How many environments to step in parallel.')
|
||||
tf.app.flags.DEFINE_integer('num_episodes', 5, 'Minimum number of episodes to render.')
|
||||
tf.app.flags.DEFINE_boolean('env_processes', True,
|
||||
'Step environments in separate processes to circumvent the GIL.')
|
||||
tf.app.run()
|
||||
|
||||
Reference in New Issue
Block a user