add a temp copy of TF agents (until the API stops changing or configs.py is included)

Erwin Coumans
2017-11-16 16:47:14 +00:00
parent 7f654bdd87
commit 7b030426c1
24 changed files with 3294 additions and 27 deletions


@@ -0,0 +1,21 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .algorithm import PPOAlgorithm


@@ -0,0 +1,515 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm.
Based on John Schulman's implementation in Python and Theano:
https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from . import memory
from . import normalize
from . import utility
class PPOAlgorithm(object):
"""A vectorized implementation of the PPO algorithm by John Schulman."""
def __init__(self, batch_env, step, is_training, should_log, config):
"""Create an instance of the PPO algorithm.
Args:
batch_env: In-graph batch environment.
step: Integer tensor holding the current training step.
is_training: Boolean tensor for whether the algorithm should train.
should_log: Boolean tensor for whether summaries should be returned.
config: Object containing the agent configuration as attributes.
"""
self._batch_env = batch_env
self._step = step
self._is_training = is_training
self._should_log = should_log
self._config = config
self._observ_filter = normalize.StreamingNormalize(
self._batch_env.observ[0], center=True, scale=True, clip=5,
name='normalize_observ')
self._reward_filter = normalize.StreamingNormalize(
self._batch_env.reward[0], center=False, scale=True, clip=10,
name='normalize_reward')
# Memory stores tuple of observ, action, mean, logstd, reward.
template = (
self._batch_env.observ[0], self._batch_env.action[0],
self._batch_env.action[0], self._batch_env.action[0],
self._batch_env.reward[0])
self._memory = memory.EpisodeMemory(
template, config.update_every, config.max_length, 'memory')
self._memory_index = tf.Variable(0, False)
use_gpu = self._config.use_gpu and utility.available_gpus()
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
# Create network variables for later calls to reuse.
action_size = self._batch_env.action.shape[1].value
self._network = tf.make_template(
'network', functools.partial(config.network, config, action_size))
output = self._network(
tf.zeros_like(self._batch_env.observ)[:, None],
tf.ones(len(self._batch_env)))
with tf.variable_scope('ppo_temporary'):
self._episodes = memory.EpisodeMemory(
template, len(batch_env), config.max_length, 'episodes')
if output.state is None:
self._last_state = None
else:
# Ensure the batch dimension is set.
tf.contrib.framework.nest.map_structure(
lambda x: x.set_shape([len(batch_env)] + x.shape.as_list()[1:]),
output.state)
# pylint: disable=undefined-variable
self._last_state = tf.contrib.framework.nest.map_structure(
lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
output.state)
self._last_action = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_action')
self._last_mean = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_mean')
self._last_logstd = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_logstd')
self._penalty = tf.Variable(
self._config.kl_init_penalty, False, dtype=tf.float32)
self._optimizer = self._config.optimizer(self._config.learning_rate)
def begin_episode(self, agent_indices):
"""Reset the recurrent states and stored episode.
Args:
agent_indices: Tensor containing current batch indices.
Returns:
Summary tensor.
"""
with tf.name_scope('begin_episode/'):
if self._last_state is None:
reset_state = tf.no_op()
else:
reset_state = utility.reinit_nested_vars(
self._last_state, agent_indices)
reset_buffer = self._episodes.clear(agent_indices)
with tf.control_dependencies([reset_state, reset_buffer]):
return tf.constant('')
def perform(self, agent_indices, observ):
"""Compute batch of actions and a summary for a batch of observation.
Args:
agent_indices: Tensor containing current batch indices.
observ: Tensor of a batch of observations for all agents.
Returns:
Tuple of action batch tensor and summary tensor.
"""
with tf.name_scope('perform/'):
observ = self._observ_filter.transform(observ)
if self._last_state is None:
state = None
else:
state = tf.contrib.framework.nest.map_structure(
lambda x: tf.gather(x, agent_indices), self._last_state)
output = self._network(observ[:, None], tf.ones(observ.shape[0]), state)
action = tf.cond(
self._is_training, output.policy.sample, lambda: output.mean)
logprob = output.policy.log_prob(action)[:, 0]
# pylint: disable=g-long-lambda
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
tf.summary.histogram('mean', output.mean[:, 0]),
tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
tf.summary.histogram('action', action[:, 0]),
tf.summary.histogram('logprob', logprob)]), str)
# Remember current policy to append to memory in the experience callback.
if self._last_state is None:
assign_state = tf.no_op()
else:
assign_state = utility.assign_nested_vars(
self._last_state, output.state, agent_indices)
with tf.control_dependencies([
assign_state,
tf.scatter_update(
self._last_action, agent_indices, action[:, 0]),
tf.scatter_update(
self._last_mean, agent_indices, output.mean[:, 0]),
tf.scatter_update(
self._last_logstd, agent_indices, output.logstd[:, 0])]):
return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
def experience(
self, agent_indices, observ, action, reward, unused_done, unused_nextob):
"""Process the transition tuple of the current step.
When training, add the current transition tuple to the memory and update
the streaming statistics for observations and rewards. A summary string is
returned if requested at this step.
Args:
agent_indices: Tensor containing current batch indices.
observ: Batch tensor of observations.
action: Batch tensor of actions.
reward: Batch tensor of rewards.
unused_done: Batch tensor of done flags.
unused_nextob: Batch tensor of successor observations.
Returns:
Summary tensor.
"""
with tf.name_scope('experience/'):
return tf.cond(
self._is_training,
# pylint: disable=g-long-lambda
lambda: self._define_experience(
agent_indices, observ, action, reward), str)
def _define_experience(self, agent_indices, observ, action, reward):
"""Implement the branch of experience() entered during training."""
update_filters = tf.summary.merge([
self._observ_filter.update(observ),
self._reward_filter.update(reward)])
with tf.control_dependencies([update_filters]):
if self._config.train_on_agent_action:
# NOTE: Doesn't seem to change much.
action = self._last_action
batch = (
observ, action, tf.gather(self._last_mean, agent_indices),
tf.gather(self._last_logstd, agent_indices), reward)
append = self._episodes.append(batch, agent_indices)
with tf.control_dependencies([append]):
norm_observ = self._observ_filter.transform(observ)
norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
# pylint: disable=g-long-lambda
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
update_filters,
self._observ_filter.summary(),
self._reward_filter.summary(),
tf.summary.scalar('memory_size', self._memory_index),
tf.summary.histogram('normalized_observ', norm_observ),
tf.summary.histogram('action', self._last_action),
tf.summary.scalar('normalized_reward', norm_reward)]), str)
return summary
def end_episode(self, agent_indices):
"""Add episodes to the memory and perform update steps if memory is full.
During training, add the collected episodes of the batch indices that
finished their episode to the memory. If the memory is full, train on it,
and then clear the memory. A summary string is returned if requested at
this step.
Args:
agent_indices: Tensor containing current batch indices.
Returns:
Summary tensor.
"""
with tf.name_scope('end_episode/'):
return tf.cond(
self._is_training,
lambda: self._define_end_episode(agent_indices), str)
def _define_end_episode(self, agent_indices):
"""Implement the branch of end_episode() entered during training."""
episodes, length = self._episodes.data(agent_indices)
space_left = self._config.update_every - self._memory_index
use_episodes = tf.range(tf.minimum(
tf.shape(agent_indices)[0], space_left))
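# Episodes that do not fit into the remaining memory space are dropped.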
episodes = [tf.gather(elem, use_episodes) for elem in episodes]
append = self._memory.replace(
episodes, tf.gather(length, use_episodes),
use_episodes + self._memory_index)
with tf.control_dependencies([append]):
inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
with tf.control_dependencies([inc_index]):
memory_full = self._memory_index >= self._config.update_every
return tf.cond(memory_full, self._training, str)
def _training(self):
"""Perform multiple training iterations of both policy and value baseline.
Trains on the episodes collected in the memory, then resets the memory.
Always returns a summary string.
Returns:
Summary tensor.
"""
with tf.name_scope('training'):
assert_full = tf.assert_equal(
self._memory_index, self._config.update_every)
with tf.control_dependencies([assert_full]):
data = self._memory.data()
(observ, action, old_mean, old_logstd, reward), length = data
with tf.control_dependencies([tf.assert_greater(length, 0)]):
length = tf.identity(length)
observ = self._observ_filter.transform(observ)
reward = self._reward_filter.transform(reward)
update_summary = self._perform_update_steps(
observ, action, old_mean, old_logstd, reward, length)
with tf.control_dependencies([update_summary]):
penalty_summary = self._adjust_penalty(
observ, old_mean, old_logstd, length)
with tf.control_dependencies([penalty_summary]):
clear_memory = tf.group(
self._memory.clear(), self._memory_index.assign(0))
with tf.control_dependencies([clear_memory]):
weight_summary = utility.variable_summaries(
tf.trainable_variables(), self._config.weight_summaries)
return tf.summary.merge([
update_summary, penalty_summary, weight_summary])
def _perform_update_steps(
self, observ, action, old_mean, old_logstd, reward, length):
"""Perform multiple update steps of value function and policy.
The advantage is computed once at the beginning and shared across
iterations. Only one iteration's summary can be returned, so the summary from
the middle iteration is used.
Args:
observ: Sequences of observations.
action: Sequences of actions.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
reward: Sequences of rewards.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
return_ = utility.discounted_return(
reward, length, self._config.discount)
value = self._network(observ, length).value
if self._config.gae_lambda:
advantage = utility.lambda_return(
reward, value, length, self._config.discount,
self._config.gae_lambda)
else:
advantage = return_ - value
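# Standardize the advantage estimates to zero mean and unit variance before
# computing the surrogate loss.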
mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
advantage = tf.Print(
advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
'return and value: ')
advantage = tf.Print(
advantage, [tf.reduce_mean(advantage)],
'normalized advantage: ')
# pylint: disable=g-long-lambda
value_loss, policy_loss, summary = tf.scan(
lambda _1, _2: self._update_step(
observ, action, old_mean, old_logstd, reward, advantage, length),
tf.range(self._config.update_epochs),
[0., 0., ''], parallel_iterations=1)
print_losses = tf.group(
tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
with tf.control_dependencies([value_loss, policy_loss, print_losses]):
return summary[self._config.update_epochs // 2]
def _update_step(
self, observ, action, old_mean, old_logstd, reward, advantage, length):
"""Compute the current combined loss and perform a gradient update step.
Args:
observ: Sequences of observations.
action: Sequences of actions.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
reward: Sequences of reward.
advantage: Sequences of advantages.
length: Batch of sequence lengths.
Returns:
Tuple of value loss, policy loss, and summary tensor.
"""
value_loss, value_summary = self._value_loss(observ, reward, length)
network = self._network(observ, length)
policy_loss, policy_summary = self._policy_loss(
network.mean, network.logstd, old_mean, old_logstd, action,
advantage, length)
value_gradients, value_variables = (
zip(*self._optimizer.compute_gradients(value_loss)))
policy_gradients, policy_variables = (
zip(*self._optimizer.compute_gradients(policy_loss)))
all_gradients = value_gradients + policy_gradients
all_variables = value_variables + policy_variables
optimize = self._optimizer.apply_gradients(
zip(all_gradients, all_variables))
summary = tf.summary.merge([
value_summary, policy_summary,
tf.summary.scalar(
'value_gradient_norm', tf.global_norm(value_gradients)),
tf.summary.scalar(
'policy_gradient_norm', tf.global_norm(policy_gradients)),
utility.gradient_summaries(
zip(value_gradients, value_variables), dict(value=r'.*')),
utility.gradient_summaries(
zip(policy_gradients, policy_variables), dict(policy=r'.*'))])
with tf.control_dependencies([optimize]):
return [tf.identity(x) for x in (value_loss, policy_loss, summary)]
def _value_loss(self, observ, reward, length):
"""Compute the loss function for the value baseline.
The value loss is the mean squared difference between empirical and
approximated returns over the collected episodes. Returns the loss tensor and
a summary string.
Args:
observ: Sequences of observations.
reward: Sequences of reward.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
with tf.name_scope('value_loss'):
value = self._network(observ, length).value
return_ = utility.discounted_return(
reward, length, self._config.discount)
advantage = return_ - value
value_loss = 0.5 * self._mask(advantage ** 2, length)
summary = tf.summary.merge([
tf.summary.histogram('value_loss', value_loss),
tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
value_loss = tf.reduce_mean(value_loss)
return tf.check_numerics(value_loss, 'value_loss'), summary
def _policy_loss(
self, mean, logstd, old_mean, old_logstd, action, advantage, length):
"""Compute the policy loss composed of multiple components.
1. The policy gradient loss is importance sampled from the behavioral policy
that collected the data for the current training phase.
2. The second term is a KL penalty between that behavioral policy and the
current policy.
3. Additionally, if this KL already changed more than twice the target
amount, we activate a strong penalty discouraging further divergence.
Args:
mean: Sequences of action means of the current policy.
logstd: Sequences of action log stddevs of the current policy.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
action: Sequences of actions.
advantage: Sequences of advantages.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
with tf.name_scope('policy_loss'):
entropy = utility.diag_normal_entropy(mean, logstd)
kl = tf.reduce_mean(self._mask(utility.diag_normal_kl(
old_mean, old_logstd, mean, logstd), length), 1)
policy_gradient = tf.exp(
utility.diag_normal_logpdf(mean, logstd, action) -
utility.diag_normal_logpdf(old_mean, old_logstd, action))
surrogate_loss = -tf.reduce_mean(self._mask(
policy_gradient * tf.stop_gradient(advantage), length), 1)
kl_penalty = self._penalty * kl
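# Additional hard quadratic penalty that only activates once the KL exceeds
# kl_cutoff_factor times the KL target.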
cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
cutoff_count = tf.reduce_sum(
tf.cast(kl > cutoff_threshold, tf.int32))
with tf.control_dependencies([tf.cond(
cutoff_count > 0,
lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
kl_cutoff = (
self._config.kl_cutoff_coef *
tf.cast(kl > cutoff_threshold, tf.float32) *
(kl - cutoff_threshold) ** 2)
policy_loss = surrogate_loss + kl_penalty + kl_cutoff
summary = tf.summary.merge([
tf.summary.histogram('entropy', entropy),
tf.summary.histogram('kl', kl),
tf.summary.histogram('surrogate_loss', surrogate_loss),
tf.summary.histogram('kl_penalty', kl_penalty),
tf.summary.histogram('kl_cutoff', kl_cutoff),
tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
tf.summary.histogram('policy_loss', policy_loss),
tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
policy_loss = tf.reduce_mean(policy_loss, 0)
return tf.check_numerics(policy_loss, 'policy_loss'), summary
def _adjust_penalty(self, observ, old_mean, old_logstd, length):
"""Adjust the KL policy between the behavioral and current policy.
Compute how much the policy actually changed during the multiple
update steps. Adjust the penalty strength for the next training phase if we
overshot or undershot the target divergence too much.
Args:
observ: Sequences of observations.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
with tf.name_scope('adjust_penalty'):
network = self._network(observ, length)
assert_change = tf.assert_equal(
tf.reduce_all(tf.equal(network.mean, old_mean)), False,
message='policy should change')
print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
with tf.control_dependencies([assert_change, print_penalty]):
kl_change = tf.reduce_mean(self._mask(utility.diag_normal_kl(
old_mean, old_logstd, network.mean, network.logstd), length))
kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
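# Multiplicatively adapt the penalty so the measured KL moves toward the
# configured target divergence.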
maybe_increase = tf.cond(
kl_change > 1.3 * self._config.kl_target,
# pylint: disable=g-long-lambda
lambda: tf.Print(self._penalty.assign(
self._penalty * 1.5), [0], 'increase penalty '),
float)
maybe_decrease = tf.cond(
kl_change < 0.7 * self._config.kl_target,
# pylint: disable=g-long-lambda
lambda: tf.Print(self._penalty.assign(
self._penalty / 1.5), [0], 'decrease penalty '),
float)
with tf.control_dependencies([maybe_increase, maybe_decrease]):
return tf.summary.merge([
tf.summary.scalar('kl_change', kl_change),
tf.summary.scalar('penalty', self._penalty)])
def _mask(self, tensor, length):
"""Set padding elements of a batch of sequences to zero.
Useful to then safely sum along the time dimension.
Args:
tensor: Tensor of sequences.
length: Batch of sequence lengths.
Returns:
Masked sequences.
"""
with tf.name_scope('mask'):
range_ = tf.range(tensor.shape[1].value)
mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
masked = tensor * mask
return tf.check_numerics(masked, 'masked')
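For reference, here is a standalone NumPy sketch of the three policy-loss terms described in _policy_loss() above, written for flat batches of diagonal-Gaussian statistics and ignoring the sequence masking that the class performs. The cutoff_factor and cutoff_coef defaults below are placeholders for config.kl_cutoff_factor and config.kl_cutoff_coef, and all names are illustrative rather than part of the class API.

import numpy as np

def diag_normal_logpdf(mean, logstd, x):
  # Log density of a diagonal Gaussian, summed over the action dimension.
  return np.sum(-0.5 * np.log(2 * np.pi) - logstd
                - 0.5 * ((x - mean) / np.exp(logstd)) ** 2, -1)

def diag_normal_kl(mean0, logstd0, mean1, logstd1):
  # KL(N0 || N1) for diagonal Gaussians, summed over the action dimension.
  var0, var1 = np.exp(2 * logstd0), np.exp(2 * logstd1)
  return 0.5 * np.sum(var0 / var1 + (mean1 - mean0) ** 2 / var1
                      + 2 * logstd1 - 2 * logstd0 - 1, -1)

def ppo_policy_loss(mean, logstd, old_mean, old_logstd, action, advantage,
                    penalty, kl_target, cutoff_factor=2.0, cutoff_coef=1000.0):
  ratio = np.exp(diag_normal_logpdf(mean, logstd, action)
                 - diag_normal_logpdf(old_mean, old_logstd, action))
  surrogate = -np.mean(ratio * advantage)      # 1. importance-sampled PG loss
  kl = diag_normal_kl(old_mean, old_logstd, mean, logstd)
  kl_penalty = penalty * np.mean(kl)           # 2. adaptive KL penalty
  cutoff = cutoff_factor * kl_target           # 3. quadratic KL cutoff
  kl_cutoff = cutoff_coef * np.mean((kl > cutoff) * (kl - cutoff) ** 2)
  return surrogate + kl_penalty + kl_cutoff

mean = np.zeros((64, 3))
logstd = np.full((64, 3), -0.5)
action = np.random.randn(64, 3)
advantage = np.random.randn(64)
loss = ppo_policy_loss(mean, logstd, mean + 0.01, logstd, action, advantage,
                       penalty=1.0, kl_target=0.01)

_adjust_penalty() above then multiplies the penalty by 1.5 whenever the measured KL exceeds 1.3 * kl_target and divides it by 1.5 whenever it falls below 0.7 * kl_target, so the penalty term tracks the target divergence across training phases.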


@@ -0,0 +1,152 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Memory that stores episodes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class EpisodeMemory(object):
"""Memory that stores episodes."""
def __init__(self, template, capacity, max_length, scope):
"""Create a memory that stores episodes.
Each transition tuple consists of quantities specified by the template.
These quantities would typically be observations, actions, rewards, and
done indicators.
Args:
template: List of tensors to derive shapes and dtypes of each transition.
capacity: Number of episodes, or rows, held by the memory.
max_length: Allocated sequence length for the episodes.
scope: Variable scope to use for internal variables.
"""
self._capacity = capacity
self._max_length = max_length
with tf.variable_scope(scope) as var_scope:
self._scope = var_scope
self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
self._buffers = [
tf.Variable(tf.zeros(
[capacity, max_length] + elem.shape.as_list(),
elem.dtype), False)
for elem in template]
def length(self, rows=None):
"""Tensor holding the current length of episodes.
Args:
rows: Episodes to select length from, defaults to all.
Returns:
Batch tensor of sequence lengths.
"""
rows = tf.range(self._capacity) if rows is None else rows
return tf.gather(self._length, rows)
def append(self, transitions, rows=None):
"""Append a batch of transitions to rows of the memory.
Args:
transitions: Tuple of transition quantities with batch dimension.
rows: Episodes to append to, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
assert_capacity = tf.assert_less(
rows, self._capacity,
message='capacity exceeded')
with tf.control_dependencies([assert_capacity]):
assert_max_length = tf.assert_less(
tf.gather(self._length, rows), self._max_length,
message='max length exceeded')
append_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, transitions):
timestep = tf.gather(self._length, rows)
indices = tf.stack([rows, timestep], 1)
append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
with tf.control_dependencies(append_ops):
episode_mask = tf.reduce_sum(tf.one_hot(
rows, self._capacity, dtype=tf.int32), 0)
return self._length.assign_add(episode_mask)
def replace(self, episodes, length, rows=None):
"""Replace full episodes.
Args:
episodes: Tuple of transition quantities with batch and time dimensions.
length: Batch of sequence lengths.
rows: Episodes to replace, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
assert_capacity = tf.assert_less(
rows, self._capacity, message='capacity exceeded')
with tf.control_dependencies([assert_capacity]):
assert_max_length = tf.assert_less_equal(
length, self._max_length, message='max length exceeded')
replace_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, episodes):
replace_op = tf.scatter_update(buffer_, rows, elements)
replace_ops.append(replace_op)
with tf.control_dependencies(replace_ops):
return tf.scatter_update(self._length, rows, length)
def data(self, rows=None):
"""Access a batch of episodes from the memory.
Padding elements after the length of each episode are unspecified and might
contain old data.
Args:
rows: Episodes to select, defaults to all.
Returns:
Tuple containing a tuple of transition quantities with batch and time
dimensions, and a batch of sequence lengths.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers]
length = tf.gather(self._length, rows)
return episode, length
def clear(self, rows=None):
"""Reset episodes in the memory.
Internally, this only sets their lengths to zero. The memory entries will
be overridden by future calls to append() or replace().
Args:
rows: Episodes to clear, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
return tf.scatter_update(self._length, rows, tf.zeros_like(rows))
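A minimal usage sketch of the class above, assuming TensorFlow 1.x and that EpisodeMemory is in scope; the template shapes and the capacity/length values are made up for illustration.

import tensorflow as tf

# Template: one observation, one action, one scalar reward per transition.
template = (tf.zeros([4]), tf.zeros([2]), tf.zeros([]))
memory = EpisodeMemory(template, capacity=8, max_length=100, scope='demo_memory')

rows = tf.constant([0, 1])                         # write to episodes 0 and 1
batch = (tf.ones([2, 4]), tf.ones([2, 2]), tf.ones([2]))
append = memory.append(batch, rows)                # appends one step per row

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(append)
  episodes, length = sess.run(memory.data(rows))
  print(length)                                    # -> [1 1]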


@@ -0,0 +1,168 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize tensors based on streaming estimates of mean and variance."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class StreamingNormalize(object):
"""Normalize tensors based on streaming estimates of mean and variance."""
def __init__(
self, template, center=True, scale=True, clip=10, name='normalize'):
"""Normalize tensors based on streaming estimates of mean and variance.
Centering the value, scaling it by the standard deviation, and clipping
outlier values are optional.
Args:
template: Example tensor providing shape and dtype of the value to track.
center: Python boolean indicating whether to subtract mean from values.
scale: Python boolean indicating whether to scale values by stddev.
clip: Magnitude at which to clip normalized values; falsy disables clipping.
name: Parent scope of operations provided by this class.
"""
self._center = center
self._scale = scale
self._clip = clip
self._name = name
with tf.name_scope(name):
self._count = tf.Variable(0, False)
self._mean = tf.Variable(tf.zeros_like(template), False)
self._var_sum = tf.Variable(tf.zeros_like(template), False)
def transform(self, value):
"""Normalize a single or batch tensor.
Applies the activated transformations in the constructor using current
estimates of mean and variance.
Args:
value: Batch or single value tensor.
Returns:
Normalized batch or single value tensor.
"""
with tf.name_scope(self._name + '/transform'):
no_batch_dim = value.shape.ndims == self._mean.shape.ndims
if no_batch_dim:
# Add a batch dimension if necessary.
value = value[None, ...]
if self._center:
value -= self._mean[None, ...]
if self._scale:
# We cannot scale before seeing at least two samples.
value /= tf.cond(
self._count > 1, lambda: self._std() + 1e-8,
lambda: tf.ones_like(self._var_sum))[None]
if self._clip:
value = tf.clip_by_value(value, -self._clip, self._clip)
# Remove batch dimension if necessary.
if no_batch_dim:
value = value[0]
return tf.check_numerics(value, 'value')
def update(self, value):
"""Update the mean and variance estimates.
Args:
value: Batch or single value tensor.
Returns:
Summary tensor.
"""
with tf.name_scope(self._name + '/update'):
if value.shape.ndims == self._mean.shape.ndims:
# Add a batch dimension if necessary.
value = value[None, ...]
count = tf.shape(value)[0]
with tf.control_dependencies([self._count.assign_add(count)]):
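# Batched Welford-style update of the running mean and of the summed
# squared deviations.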
step = tf.cast(self._count, tf.float32)
mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
new_mean = self._mean + mean_delta / step
new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
var_delta = (
value - self._mean[None, ...]) * (value - new_mean[None, ...])
new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
with tf.control_dependencies([new_mean, new_var_sum]):
update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
with tf.control_dependencies(update):
if value.shape.ndims == 1:
value = tf.reduce_mean(value)
return self._summary('value', tf.reduce_mean(value))
def reset(self):
"""Reset the estimates of mean and variance.
Resets the full state of this class.
Returns:
Operation.
"""
with tf.name_scope(self._name + '/reset'):
return tf.group(
self._count.assign(0),
self._mean.assign(tf.zeros_like(self._mean)),
self._var_sum.assign(tf.zeros_like(self._var_sum)))
def summary(self):
"""Summary string of mean and standard deviation.
Returns:
Summary tensor.
"""
with tf.name_scope(self._name + '/summary'):
mean_summary = tf.cond(
self._count > 0, lambda: self._summary('mean', self._mean), str)
std_summary = tf.cond(
self._count > 1, lambda: self._summary('stddev', self._std()), str)
return tf.summary.merge([mean_summary, std_summary])
def _std(self):
"""Computes the current estimate of the standard deviation.
Note that the standard deviation is not defined until at least two samples
have been seen.
Returns:
Tensor of the current standard deviation estimate.
"""
variance = tf.cond(
self._count > 1,
lambda: self._var_sum / tf.cast(self._count - 1, tf.float32),
lambda: tf.ones_like(self._var_sum) * float('nan'))
# The epsilon corrects for small negative variance values caused by
# the algorithm. It was empirically chosen to work with all environments
# tested.
return tf.sqrt(variance + 1e-4)
def _summary(self, name, tensor):
"""Create a scalar or histogram summary matching the rank of the tensor.
Args:
name: Name for the summary.
tensor: Tensor to summarize.
Returns:
Summary tensor.
"""
if tensor.shape.ndims == 0:
return tf.summary.scalar(name, tensor)
else:
return tf.summary.histogram(name, tensor)
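For intuition, the streaming estimate kept by update() above can be restated in NumPy as a batched Welford-style update; this sketch is illustrative only and its names are not part of the class.

import numpy as np

def streaming_update(count, mean, var_sum, batch):
  # Update the running mean and the summed squared deviations from one batch.
  new_count = count + batch.shape[0]
  new_mean = mean + (batch - mean).sum(0) / new_count
  if new_count <= 1:
    new_mean = batch[0]                 # a single first sample seeds the mean
  new_var_sum = var_sum + ((batch - mean) * (batch - new_mean)).sum(0)
  return new_count, new_mean, new_var_sum

count, mean, var_sum = 0, np.zeros(3), np.zeros(3)
for batch in np.split(np.random.randn(100, 3), 10):
  count, mean, var_sum = streaming_update(count, mean, var_sum, batch)
std = np.sqrt(var_sum / (count - 1) + 1e-4)        # same epsilon as _std()
normalized = np.clip((np.random.randn(3) - mean) / (std + 1e-8), -10, 10)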


@@ -0,0 +1,213 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for the PPO algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import re
import tensorflow as tf
from tensorflow.python.client import device_lib
def reinit_nested_vars(variables, indices=None):
"""Reset all variables in a nested tuple to zeros.
Args:
variables: Nested tuple or list of variables.
indices: Batch indices to reset, defaults to all.
Returns:
Operation.
"""
if isinstance(variables, (tuple, list)):
return tf.group(*[
reinit_nested_vars(variable, indices) for variable in variables])
if indices is None:
return variables.assign(tf.zeros_like(variables))
else:
zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list())
return tf.scatter_update(variables, indices, zeros)
def assign_nested_vars(variables, tensors, indices=None):
"""Assign tensors to matching nested tuple of variables.
Args:
variables: Nested tuple or list of variables to update.
tensors: Nested tuple or list of tensors to assign.
indices: Batch indices to assign to, defaults to all.
Returns:
Operation.
"""
if isinstance(variables, (tuple, list)):
return tf.group(*[
assign_nested_vars(variable, tensor, indices)
for variable, tensor in zip(variables, tensors)])
if indices is None:
return variables.assign(tensors)
else:
return tf.scatter_update(variables, indices, tensors)
def discounted_return(reward, length, discount):
"""Discounted Monte-Carlo returns."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
return_ = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur + discount * agg,
tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
def fixed_step_return(reward, value, length, discount, window):
"""N-step discounted return."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
return_ = tf.zeros_like(reward)
for _ in range(window):
return_ += reward
reward = discount * tf.concat(
[reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
return_ += discount ** window * tf.concat(
[value[:, window:], tf.zeros_like(value[:, -window:])], 1)
return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
def lambda_return(reward, value, length, discount, lambda_):
"""TD-lambda returns."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
sequence = mask * reward + discount * value * (1 - lambda_)
discount = mask * discount * lambda_
sequence = tf.stack([sequence, discount], 2)
return_ = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur[0] + cur[1] * agg,
tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
def lambda_advantage(reward, value, length, discount):
"""Generalized Advantage Estimation."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
delta = reward + discount * next_value - value
advantage = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur + discount * agg,
tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
def diag_normal_kl(mean0, logstd0, mean1, logstd1):
"""Epirical KL divergence of two normals with diagonal covariance."""
logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
return 0.5 * (
tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) +
tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
mean0.shape[-1].value)
def diag_normal_logpdf(mean, logstd, loc):
"""Log density of a normal with diagonal covariance."""
constant = -0.5 * math.log(2 * math.pi) - logstd
value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2
return tf.reduce_sum(constant + value, -1)
def diag_normal_entropy(mean, logstd):
"""Empirical entropy of a normal with diagonal covariance."""
constant = mean.shape[-1].value * math.log(2 * math.pi * math.e)
return (constant + tf.reduce_sum(2 * logstd, 1)) / 2
def available_gpus():
"""List of GPU device names detected by TensorFlow."""
local_device_protos = device_lib.list_local_devices()
return [x.name for x in local_device_protos if x.device_type == 'GPU']
def gradient_summaries(grad_vars, groups=None, scope='gradients'):
"""Create histogram summaries of the gradient.
Summaries can be grouped via regexes matching variable names.
Args:
grad_vars: List of (gradient, variable) tuples as returned by optimizers.
groups: Mapping of name to regex for grouping summaries.
scope: Name scope for this operation.
Returns:
Summary tensor.
"""
groups = groups or {r'all': r'.*'}
grouped = collections.defaultdict(list)
for grad, var in grad_vars:
if grad is None:
continue
for name, pattern in groups.items():
if re.match(pattern, var.name):
name = re.sub(pattern, name, var.name)
grouped[name].append(grad)
for name in groups:
if name not in grouped:
tf.logging.warn("No variables matching '{}' group.".format(name))
summaries = []
for name, grads in grouped.items():
grads = [tf.reshape(grad, [-1]) for grad in grads]
grads = tf.concat(grads, 0)
summaries.append(tf.summary.histogram(scope + '/' + name, grads))
return tf.summary.merge(summaries)
def variable_summaries(vars_, groups=None, scope='weights'):
"""Create histogram summaries for the provided variables.
Summaries can be grouped via regexes matching variable names.
Args:
vars_: List of variables to summarize.
groups: Mapping of name to regex for grouping summaries.
scope: Name scope for this operation.
Returns:
Summary tensor.
"""
groups = groups or {r'all': r'.*'}
grouped = collections.defaultdict(list)
for var in vars_:
for name, pattern in groups.items():
if re.match(pattern, var.name):
name = re.sub(pattern, name, var.name)
grouped[name].append(var)
for name in groups:
if name not in grouped:
tf.logging.warn("No variables matching '{}' group.".format(name))
summaries = []
# pylint: disable=redefined-argument-from-local
for name, vars_ in grouped.items():
vars_ = [tf.reshape(var, [-1]) for var in vars_]
vars_ = tf.concat(vars_, 0)
summaries.append(tf.summary.histogram(scope + '/' + name, vars_))
return tf.summary.merge(summaries)
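To make the return computation above concrete, here is a NumPy restatement of discounted_return() with a tiny worked example; the TF version performs the same recurrence with a reversed tf.scan, and this sketch is illustrative only.

import numpy as np

def discounted_return_np(reward, length, discount):
  # Reverse accumulation of r_t + discount * R_{t+1}, with steps past the
  # episode length masked to zero.
  batch, time = reward.shape
  mask = (np.arange(time)[None, :] < length[:, None]).astype(reward.dtype)
  reward = mask * reward
  return_ = np.zeros_like(reward)
  running = np.zeros(batch)
  for t in reversed(range(time)):
    running = reward[:, t] + discount * running
    return_[:, t] = running
  return return_

reward = np.array([[1.0, 1.0, 1.0, 0.0]])     # length-3 episode plus padding
print(discounted_return_np(reward, np.array([3]), 0.9))
# -> [[2.71 1.9  1.   0.  ]]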