Add a temporary copy of TF Agents (until the API stops changing or configs.py is included)
21
examples/pybullet/gym/pybullet_envs/agents/ppo/__init__.py
Normal file
@@ -0,0 +1,21 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Proximal Policy Optimization algorithm."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from .algorithm import PPOAlgorithm
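Given the package layout above, the vendored copy can be imported from the pybullet gym package; a one-line usage sketch (the exact import root is an assumption that depends on how pybullet_envs is installed):

from pybullet_envs.agents.ppo import PPOAlgorithm  # assumed install layout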
515
examples/pybullet/gym/pybullet_envs/agents/ppo/algorithm.py
Normal file
@@ -0,0 +1,515 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Proximal Policy Optimization algorithm.

Based on John Schulman's implementation in Python and Theano:
https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools

import tensorflow as tf

from . import memory
from . import normalize
from . import utility


class PPOAlgorithm(object):
  """A vectorized implementation of the PPO algorithm by John Schulman."""

  def __init__(self, batch_env, step, is_training, should_log, config):
    """Create an instance of the PPO algorithm.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agent configuration as attributes.
    """
    self._batch_env = batch_env
    self._step = step
    self._is_training = is_training
    self._should_log = should_log
    self._config = config
    self._observ_filter = normalize.StreamingNormalize(
        self._batch_env.observ[0], center=True, scale=True, clip=5,
        name='normalize_observ')
    self._reward_filter = normalize.StreamingNormalize(
        self._batch_env.reward[0], center=False, scale=True, clip=10,
        name='normalize_reward')
    # Memory stores tuple of observ, action, mean, logstd, reward.
    template = (
        self._batch_env.observ[0], self._batch_env.action[0],
        self._batch_env.action[0], self._batch_env.action[0],
        self._batch_env.reward[0])
    self._memory = memory.EpisodeMemory(
        template, config.update_every, config.max_length, 'memory')
    self._memory_index = tf.Variable(0, False)
    use_gpu = self._config.use_gpu and utility.available_gpus()
    with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
      # Create network variables for later calls to reuse.
      action_size = self._batch_env.action.shape[1].value
      self._network = tf.make_template(
          'network', functools.partial(config.network, config, action_size))
      output = self._network(
          tf.zeros_like(self._batch_env.observ)[:, None],
          tf.ones(len(self._batch_env)))
      with tf.variable_scope('ppo_temporary'):
        self._episodes = memory.EpisodeMemory(
            template, len(batch_env), config.max_length, 'episodes')
        if output.state is None:
          self._last_state = None
        else:
          # Ensure the batch dimension is set.
          tf.contrib.framework.nest.map_structure(
              lambda x: x.set_shape([len(batch_env)] + x.shape.as_list()[1:]),
              output.state)
          # pylint: disable=undefined-variable
          self._last_state = tf.contrib.framework.nest.map_structure(
              lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
              output.state)
        self._last_action = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_action')
        self._last_mean = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_mean')
        self._last_logstd = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_logstd')
    self._penalty = tf.Variable(
        self._config.kl_init_penalty, False, dtype=tf.float32)
    self._optimizer = self._config.optimizer(self._config.learning_rate)

  def begin_episode(self, agent_indices):
    """Reset the recurrent states and stored episode.

    Args:
      agent_indices: Tensor containing current batch indices.

    Returns:
      Summary tensor.
    """
    with tf.name_scope('begin_episode/'):
      if self._last_state is None:
        reset_state = tf.no_op()
      else:
        reset_state = utility.reinit_nested_vars(
            self._last_state, agent_indices)
      reset_buffer = self._episodes.clear(agent_indices)
      with tf.control_dependencies([reset_state, reset_buffer]):
        return tf.constant('')

  def perform(self, agent_indices, observ):
    """Compute batch of actions and a summary for a batch of observations.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
    with tf.name_scope('perform/'):
      observ = self._observ_filter.transform(observ)
      if self._last_state is None:
        state = None
      else:
        state = tf.contrib.framework.nest.map_structure(
            lambda x: tf.gather(x, agent_indices), self._last_state)
      output = self._network(observ[:, None], tf.ones(observ.shape[0]), state)
      action = tf.cond(
          self._is_training, output.policy.sample, lambda: output.mean)
      logprob = output.policy.log_prob(action)[:, 0]
      # pylint: disable=g-long-lambda
      summary = tf.cond(self._should_log, lambda: tf.summary.merge([
          tf.summary.histogram('mean', output.mean[:, 0]),
          tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
          tf.summary.histogram('action', action[:, 0]),
          tf.summary.histogram('logprob', logprob)]), str)
      # Remember current policy to append to memory in the experience callback.
      if self._last_state is None:
        assign_state = tf.no_op()
      else:
        assign_state = utility.assign_nested_vars(
            self._last_state, output.state, agent_indices)
      with tf.control_dependencies([
          assign_state,
          tf.scatter_update(
              self._last_action, agent_indices, action[:, 0]),
          tf.scatter_update(
              self._last_mean, agent_indices, output.mean[:, 0]),
          tf.scatter_update(
              self._last_logstd, agent_indices, output.logstd[:, 0])]):
        return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)

  def experience(
      self, agent_indices, observ, action, reward, unused_done, unused_nextob):
    """Process the transition tuple of the current step.

    When training, add the current transition tuple to the memory and update
    the streaming statistics for observations and rewards. A summary string is
    returned if requested at this step.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Batch tensor of observations.
      action: Batch tensor of actions.
      reward: Batch tensor of rewards.
      unused_done: Batch tensor of done flags.
      unused_nextob: Batch tensor of successor observations.

    Returns:
      Summary tensor.
    """
    with tf.name_scope('experience/'):
      return tf.cond(
          self._is_training,
          # pylint: disable=g-long-lambda
          lambda: self._define_experience(
              agent_indices, observ, action, reward), str)

  def _define_experience(self, agent_indices, observ, action, reward):
    """Implement the branch of experience() entered during training."""
    update_filters = tf.summary.merge([
        self._observ_filter.update(observ),
        self._reward_filter.update(reward)])
    with tf.control_dependencies([update_filters]):
      if self._config.train_on_agent_action:
        # NOTE: Doesn't seem to change much.
        action = self._last_action
      batch = (
          observ, action, tf.gather(self._last_mean, agent_indices),
          tf.gather(self._last_logstd, agent_indices), reward)
      append = self._episodes.append(batch, agent_indices)
    with tf.control_dependencies([append]):
      norm_observ = self._observ_filter.transform(observ)
      norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
      # pylint: disable=g-long-lambda
      summary = tf.cond(self._should_log, lambda: tf.summary.merge([
          update_filters,
          self._observ_filter.summary(),
          self._reward_filter.summary(),
          tf.summary.scalar('memory_size', self._memory_index),
          tf.summary.histogram('normalized_observ', norm_observ),
          tf.summary.histogram('action', self._last_action),
          tf.summary.scalar('normalized_reward', norm_reward)]), str)
      return summary

  def end_episode(self, agent_indices):
    """Add episodes to the memory and perform update steps if memory is full.

    During training, add the collected episodes of the batch indices that
    finished their episode to the memory. If the memory is full, train on it,
    and then clear the memory. A summary string is returned if requested at
    this step.

    Args:
      agent_indices: Tensor containing current batch indices.

    Returns:
      Summary tensor.
    """
    with tf.name_scope('end_episode/'):
      return tf.cond(
          self._is_training,
          lambda: self._define_end_episode(agent_indices), str)

  def _define_end_episode(self, agent_indices):
    """Implement the branch of end_episode() entered during training."""
    episodes, length = self._episodes.data(agent_indices)
    space_left = self._config.update_every - self._memory_index
    use_episodes = tf.range(tf.minimum(
        tf.shape(agent_indices)[0], space_left))
    episodes = [tf.gather(elem, use_episodes) for elem in episodes]
    append = self._memory.replace(
        episodes, tf.gather(length, use_episodes),
        use_episodes + self._memory_index)
    with tf.control_dependencies([append]):
      inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
    with tf.control_dependencies([inc_index]):
      memory_full = self._memory_index >= self._config.update_every
      return tf.cond(memory_full, self._training, str)

  def _training(self):
    """Perform multiple training iterations of both policy and value baseline.

    Training on the episodes collected in the memory. Reset the memory
    afterwards. Always returns a summary string.

    Returns:
      Summary tensor.
    """
    with tf.name_scope('training'):
      assert_full = tf.assert_equal(
          self._memory_index, self._config.update_every)
      with tf.control_dependencies([assert_full]):
        data = self._memory.data()
      (observ, action, old_mean, old_logstd, reward), length = data
      with tf.control_dependencies([tf.assert_greater(length, 0)]):
        length = tf.identity(length)
      observ = self._observ_filter.transform(observ)
      reward = self._reward_filter.transform(reward)
      update_summary = self._perform_update_steps(
          observ, action, old_mean, old_logstd, reward, length)
      with tf.control_dependencies([update_summary]):
        penalty_summary = self._adjust_penalty(
            observ, old_mean, old_logstd, length)
      with tf.control_dependencies([penalty_summary]):
        clear_memory = tf.group(
            self._memory.clear(), self._memory_index.assign(0))
      with tf.control_dependencies([clear_memory]):
        weight_summary = utility.variable_summaries(
            tf.trainable_variables(), self._config.weight_summaries)
        return tf.summary.merge([
            update_summary, penalty_summary, weight_summary])

  def _perform_update_steps(
      self, observ, action, old_mean, old_logstd, reward, length):
    """Perform multiple update steps of value function and policy.

    The advantage is computed once at the beginning and shared across
    iterations. We need to decide for the summary of one iteration, and thus
    choose the one after half of the iterations.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      reward: Sequences of rewards.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
    return_ = utility.discounted_return(
        reward, length, self._config.discount)
    value = self._network(observ, length).value
    if self._config.gae_lambda:
      advantage = utility.lambda_return(
          reward, value, length, self._config.discount,
          self._config.gae_lambda)
    else:
      advantage = return_ - value
    mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
    advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
    advantage = tf.Print(
        advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
        'return and value: ')
    advantage = tf.Print(
        advantage, [tf.reduce_mean(advantage)],
        'normalized advantage: ')
    # pylint: disable=g-long-lambda
    value_loss, policy_loss, summary = tf.scan(
        lambda _1, _2: self._update_step(
            observ, action, old_mean, old_logstd, reward, advantage, length),
        tf.range(self._config.update_epochs),
        [0., 0., ''], parallel_iterations=1)
    print_losses = tf.group(
        tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
        tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
    with tf.control_dependencies([value_loss, policy_loss, print_losses]):
      return summary[self._config.update_epochs // 2]

  def _update_step(
      self, observ, action, old_mean, old_logstd, reward, advantage, length):
    """Compute the current combined loss and perform a gradient update step.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      reward: Sequences of reward.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of value loss, policy loss, and summary tensor.
    """
    value_loss, value_summary = self._value_loss(observ, reward, length)
    network = self._network(observ, length)
    policy_loss, policy_summary = self._policy_loss(
        network.mean, network.logstd, old_mean, old_logstd, action,
        advantage, length)
    value_gradients, value_variables = (
        zip(*self._optimizer.compute_gradients(value_loss)))
    policy_gradients, policy_variables = (
        zip(*self._optimizer.compute_gradients(policy_loss)))
    all_gradients = value_gradients + policy_gradients
    all_variables = value_variables + policy_variables
    optimize = self._optimizer.apply_gradients(
        zip(all_gradients, all_variables))
    summary = tf.summary.merge([
        value_summary, policy_summary,
        tf.summary.scalar(
            'value_gradient_norm', tf.global_norm(value_gradients)),
        tf.summary.scalar(
            'policy_gradient_norm', tf.global_norm(policy_gradients)),
        utility.gradient_summaries(
            zip(value_gradients, value_variables), dict(value=r'.*')),
        utility.gradient_summaries(
            zip(policy_gradients, policy_variables), dict(policy=r'.*'))])
    with tf.control_dependencies([optimize]):
      return [tf.identity(x) for x in (value_loss, policy_loss, summary)]

  def _value_loss(self, observ, reward, length):
    """Compute the loss function for the value baseline.

    The value loss is the difference between empirical and approximated returns
    over the collected episodes. Returns the loss tensor and a summary string.

    Args:
      observ: Sequences of observations.
      reward: Sequences of reward.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('value_loss'):
      value = self._network(observ, length).value
      return_ = utility.discounted_return(
          reward, length, self._config.discount)
      advantage = return_ - value
      value_loss = 0.5 * self._mask(advantage ** 2, length)
      summary = tf.summary.merge([
          tf.summary.histogram('value_loss', value_loss),
          tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
      value_loss = tf.reduce_mean(value_loss)
      return tf.check_numerics(value_loss, 'value_loss'), summary

  def _policy_loss(
      self, mean, logstd, old_mean, old_logstd, action, advantage, length):
    """Compute the policy loss composed of multiple components.

    1. The policy gradient loss is importance sampled from the data-collecting
       policy at the beginning of training.
    2. The second term is a KL penalty between the policy at the beginning of
       training and the current policy.
    3. Additionally, if this KL already changed more than twice the target
       amount, we activate a strong penalty discouraging further divergence.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('policy_loss'):
      entropy = utility.diag_normal_entropy(mean, logstd)
      kl = tf.reduce_mean(self._mask(utility.diag_normal_kl(
          old_mean, old_logstd, mean, logstd), length), 1)
      policy_gradient = tf.exp(
          utility.diag_normal_logpdf(mean, logstd, action) -
          utility.diag_normal_logpdf(old_mean, old_logstd, action))
      surrogate_loss = -tf.reduce_mean(self._mask(
          policy_gradient * tf.stop_gradient(advantage), length), 1)
      kl_penalty = self._penalty * kl
      cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
      cutoff_count = tf.reduce_sum(
          tf.cast(kl > cutoff_threshold, tf.int32))
      with tf.control_dependencies([tf.cond(
          cutoff_count > 0,
          lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
        kl_cutoff = (
            self._config.kl_cutoff_coef *
            tf.cast(kl > cutoff_threshold, tf.float32) *
            (kl - cutoff_threshold) ** 2)
      policy_loss = surrogate_loss + kl_penalty + kl_cutoff
      summary = tf.summary.merge([
          tf.summary.histogram('entropy', entropy),
          tf.summary.histogram('kl', kl),
          tf.summary.histogram('surrogate_loss', surrogate_loss),
          tf.summary.histogram('kl_penalty', kl_penalty),
          tf.summary.histogram('kl_cutoff', kl_cutoff),
          tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
          tf.summary.histogram('policy_loss', policy_loss),
          tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
          tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
          tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
      policy_loss = tf.reduce_mean(policy_loss, 0)
      return tf.check_numerics(policy_loss, 'policy_loss'), summary

  def _adjust_penalty(self, observ, old_mean, old_logstd, length):
    """Adjust the KL penalty between the behavioral and current policy.

    Compute how much the policy actually changed during the multiple
    update steps. Adjust the penalty strength for the next training phase if we
    overshot or undershot the target divergence too much.

    Args:
      observ: Sequences of observations.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
    with tf.name_scope('adjust_penalty'):
      network = self._network(observ, length)
      assert_change = tf.assert_equal(
          tf.reduce_all(tf.equal(network.mean, old_mean)), False,
          message='policy should change')
      print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
      with tf.control_dependencies([assert_change, print_penalty]):
        kl_change = tf.reduce_mean(self._mask(utility.diag_normal_kl(
            old_mean, old_logstd, network.mean, network.logstd), length))
        kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
        maybe_increase = tf.cond(
            kl_change > 1.3 * self._config.kl_target,
            # pylint: disable=g-long-lambda
            lambda: tf.Print(self._penalty.assign(
                self._penalty * 1.5), [0], 'increase penalty '),
            float)
        maybe_decrease = tf.cond(
            kl_change < 0.7 * self._config.kl_target,
            # pylint: disable=g-long-lambda
            lambda: tf.Print(self._penalty.assign(
                self._penalty / 1.5), [0], 'decrease penalty '),
            float)
      with tf.control_dependencies([maybe_increase, maybe_decrease]):
        return tf.summary.merge([
            tf.summary.scalar('kl_change', kl_change),
            tf.summary.scalar('penalty', self._penalty)])

  def _mask(self, tensor, length):
    """Set padding elements of a batch of sequences to zero.

    Useful to then safely sum along the time dimension.

    Args:
      tensor: Tensor of sequences.
      length: Batch of sequence lengths.

    Returns:
      Masked sequences.
    """
    with tf.name_scope('mask'):
      range_ = tf.range(tensor.shape[1].value)
      mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
      masked = tensor * mask
      return tf.check_numerics(masked, 'masked')
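The penalty adjustment in _adjust_penalty is the adaptive KL scheme: after each training phase the measured KL change is compared against a target, and the penalty coefficient is scaled up or down by a fixed factor. A minimal plain-Python sketch of that rule, using the same 1.3/0.7 thresholds and 1.5 scaling as the graph code above (function and variable names here are illustrative only):

def adjust_kl_penalty(penalty, kl_change, kl_target):
  # Mirrors _adjust_penalty: grow the penalty when the policy moved too far
  # from the behavioral policy, shrink it when it barely moved.
  if kl_change > 1.3 * kl_target:
    penalty *= 1.5   # discourage further divergence in the next phase
  elif kl_change < 0.7 * kl_target:
    penalty /= 1.5   # allow larger policy updates in the next phase
  return penalty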
152
examples/pybullet/gym/pybullet_envs/agents/ppo/memory.py
Normal file
@@ -0,0 +1,152 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Memory that stores episodes."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


class EpisodeMemory(object):
  """Memory that stores episodes."""

  def __init__(self, template, capacity, max_length, scope):
    """Create a memory that stores episodes.

    Each transition tuple consists of quantities specified by the template.
    These quantities would typically be observations, actions, rewards, and
    done indicators.

    Args:
      template: List of tensors to derive shapes and dtypes of each transition.
      capacity: Number of episodes, or rows, held by the memory.
      max_length: Allocated sequence length for the episodes.
      scope: Variable scope to use for internal variables.
    """
    self._capacity = capacity
    self._max_length = max_length
    with tf.variable_scope(scope) as var_scope:
      self._scope = var_scope
      self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
      self._buffers = [
          tf.Variable(tf.zeros(
              [capacity, max_length] + elem.shape.as_list(),
              elem.dtype), False)
          for elem in template]

  def length(self, rows=None):
    """Tensor holding the current length of episodes.

    Args:
      rows: Episodes to select length from, defaults to all.

    Returns:
      Batch tensor of sequence lengths.
    """
    rows = tf.range(self._capacity) if rows is None else rows
    return tf.gather(self._length, rows)

  def append(self, transitions, rows=None):
    """Append a batch of transitions to rows of the memory.

    Args:
      transitions: Tuple of transition quantities with batch dimension.
      rows: Episodes to append to, defaults to all.

    Returns:
      Operation.
    """
    rows = tf.range(self._capacity) if rows is None else rows
    assert rows.shape.ndims == 1
    assert_capacity = tf.assert_less(
        rows, self._capacity,
        message='capacity exceeded')
    with tf.control_dependencies([assert_capacity]):
      assert_max_length = tf.assert_less(
          tf.gather(self._length, rows), self._max_length,
          message='max length exceeded')
    append_ops = []
    with tf.control_dependencies([assert_max_length]):
      for buffer_, elements in zip(self._buffers, transitions):
        timestep = tf.gather(self._length, rows)
        indices = tf.stack([rows, timestep], 1)
        append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
    with tf.control_dependencies(append_ops):
      episode_mask = tf.reduce_sum(tf.one_hot(
          rows, self._capacity, dtype=tf.int32), 0)
      return self._length.assign_add(episode_mask)

  def replace(self, episodes, length, rows=None):
    """Replace full episodes.

    Args:
      episodes: Tuple of transition quantities with batch and time dimensions.
      length: Batch of sequence lengths.
      rows: Episodes to replace, defaults to all.

    Returns:
      Operation.
    """
    rows = tf.range(self._capacity) if rows is None else rows
    assert rows.shape.ndims == 1
    assert_capacity = tf.assert_less(
        rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([assert_capacity]):
      assert_max_length = tf.assert_less_equal(
          length, self._max_length, message='max length exceeded')
    replace_ops = []
    with tf.control_dependencies([assert_max_length]):
      for buffer_, elements in zip(self._buffers, episodes):
        replace_op = tf.scatter_update(buffer_, rows, elements)
        replace_ops.append(replace_op)
    with tf.control_dependencies(replace_ops):
      return tf.scatter_update(self._length, rows, length)

  def data(self, rows=None):
    """Access a batch of episodes from the memory.

    Padding elements after the length of each episode are unspecified and might
    contain old data.

    Args:
      rows: Episodes to select, defaults to all.

    Returns:
      Tuple containing a tuple of transition quantities with batch and time
      dimensions, and a batch of sequence lengths.
    """
    rows = tf.range(self._capacity) if rows is None else rows
    assert rows.shape.ndims == 1
    episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers]
    length = tf.gather(self._length, rows)
    return episode, length

  def clear(self, rows=None):
    """Reset episodes in the memory.

    Internally, this only sets their lengths to zero. The memory entries will
    be overridden by future calls to append() or replace().

    Args:
      rows: Episodes to clear, defaults to all.

    Returns:
      Operation.
    """
    rows = tf.range(self._capacity) if rows is None else rows
    assert rows.shape.ndims == 1
    return tf.scatter_update(self._length, rows, tf.zeros_like(rows))
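A minimal usage sketch of EpisodeMemory, assuming TF 1.x graph mode and that the module is importable under the path implied by this commit (shapes, session setup, and the import root are illustrative assumptions, not part of the commit):

import tensorflow as tf
from pybullet_envs.agents.ppo import memory  # assumed import path

template = (tf.zeros([4]), tf.zeros([2]))           # e.g. observation, action
mem = memory.EpisodeMemory(template, capacity=8, max_length=100, scope='demo')

rows = tf.constant([0, 1])                          # episodes to extend
batch = (tf.ones([2, 4]), tf.ones([2, 2]))          # one transition per row
append = mem.append(batch, rows)
episodes, length = mem.data(rows)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(append)
  print(sess.run(length))                           # -> [1 1]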
168
examples/pybullet/gym/pybullet_envs/agents/ppo/normalize.py
Normal file
@@ -0,0 +1,168 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Normalize tensors based on streaming estimates of mean and variance."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


class StreamingNormalize(object):
  """Normalize tensors based on streaming estimates of mean and variance."""

  def __init__(
      self, template, center=True, scale=True, clip=10, name='normalize'):
    """Normalize tensors based on streaming estimates of mean and variance.

    Centering the value, scaling it by the standard deviation, and clipping
    outlier values are optional.

    Args:
      template: Example tensor providing shape and dtype of the value to track.
      center: Python boolean indicating whether to subtract mean from values.
      scale: Python boolean indicating whether to scale values by stddev.
      clip: If and when to clip normalized values.
      name: Parent scope of operations provided by this class.
    """
    self._center = center
    self._scale = scale
    self._clip = clip
    self._name = name
    with tf.name_scope(name):
      self._count = tf.Variable(0, False)
      self._mean = tf.Variable(tf.zeros_like(template), False)
      self._var_sum = tf.Variable(tf.zeros_like(template), False)

  def transform(self, value):
    """Normalize a single or batch tensor.

    Applies the activated transformations in the constructor using current
    estimates of mean and variance.

    Args:
      value: Batch or single value tensor.

    Returns:
      Normalized batch or single value tensor.
    """
    with tf.name_scope(self._name + '/transform'):
      no_batch_dim = value.shape.ndims == self._mean.shape.ndims
      if no_batch_dim:
        # Add a batch dimension if necessary.
        value = value[None, ...]
      if self._center:
        value -= self._mean[None, ...]
      if self._scale:
        # We cannot scale before seeing at least two samples.
        value /= tf.cond(
            self._count > 1, lambda: self._std() + 1e-8,
            lambda: tf.ones_like(self._var_sum))[None]
      if self._clip:
        value = tf.clip_by_value(value, -self._clip, self._clip)
      # Remove batch dimension if necessary.
      if no_batch_dim:
        value = value[0]
      return tf.check_numerics(value, 'value')

  def update(self, value):
    """Update the mean and variance estimates.

    Args:
      value: Batch or single value tensor.

    Returns:
      Summary tensor.
    """
    with tf.name_scope(self._name + '/update'):
      if value.shape.ndims == self._mean.shape.ndims:
        # Add a batch dimension if necessary.
        value = value[None, ...]
      count = tf.shape(value)[0]
      with tf.control_dependencies([self._count.assign_add(count)]):
        step = tf.cast(self._count, tf.float32)
        mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
        new_mean = self._mean + mean_delta / step
        new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
        var_delta = (
            value - self._mean[None, ...]) * (value - new_mean[None, ...])
        new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
      with tf.control_dependencies([new_mean, new_var_sum]):
        update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
      with tf.control_dependencies(update):
        if value.shape.ndims == 1:
          value = tf.reduce_mean(value)
        return self._summary('value', tf.reduce_mean(value))

  def reset(self):
    """Reset the estimates of mean and variance.

    Resets the full state of this class.

    Returns:
      Operation.
    """
    with tf.name_scope(self._name + '/reset'):
      return tf.group(
          self._count.assign(0),
          self._mean.assign(tf.zeros_like(self._mean)),
          self._var_sum.assign(tf.zeros_like(self._var_sum)))

  def summary(self):
    """Summary string of mean and standard deviation.

    Returns:
      Summary tensor.
    """
    with tf.name_scope(self._name + '/summary'):
      mean_summary = tf.cond(
          self._count > 0, lambda: self._summary('mean', self._mean), str)
      std_summary = tf.cond(
          self._count > 1, lambda: self._summary('stddev', self._std()), str)
      return tf.summary.merge([mean_summary, std_summary])

  def _std(self):
    """Computes the current estimate of the standard deviation.

    Note that the standard deviation is not defined until at least two samples
    were seen.

    Returns:
      Tensor of the current standard deviation.
    """
    variance = tf.cond(
        self._count > 1,
        lambda: self._var_sum / tf.cast(self._count - 1, tf.float32),
        lambda: tf.ones_like(self._var_sum) * float('nan'))
    # The epsilon corrects for small negative variance values caused by
    # the algorithm. It was empirically chosen to work with all environments
    # tested.
    return tf.sqrt(variance + 1e-4)

  def _summary(self, name, tensor):
    """Create a scalar or histogram summary matching the rank of the tensor.

    Args:
      name: Name for the summary.
      tensor: Tensor to summarize.

    Returns:
      Summary tensor.
    """
    if tensor.shape.ndims == 0:
      return tf.summary.scalar(name, tensor)
    else:
      return tf.summary.histogram(name, tensor)
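update() above maintains its estimates with a batched form of Welford's streaming algorithm: the count advances by the batch size, the mean moves by the summed deviation divided by the new count, and a sum of squared deviations accumulates so the variance can be read off as var_sum / (count - 1). A scalar plain-Python sketch of the same recurrence (illustrative only, not part of the commit):

import math

count, mean, var_sum = 0, 0.0, 0.0

def update(batch):
  # Same recurrence as StreamingNormalize.update, for scalar values.
  global count, mean, var_sum
  count += len(batch)
  new_mean = mean + sum(x - mean for x in batch) / count
  var_sum += sum((x - mean) * (x - new_mean) for x in batch)
  mean = new_mean

def std():
  # Matches StreamingNormalize._std, including the small epsilon.
  return math.sqrt(var_sum / (count - 1) + 1e-4)

update([1.0, 2.0, 3.0])
update([4.0])
print(mean, std())  # roughly 2.5 and 1.29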
213
examples/pybullet/gym/pybullet_envs/agents/ppo/utility.py
Normal file
@@ -0,0 +1,213 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for the PPO algorithm."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import re

import tensorflow as tf
from tensorflow.python.client import device_lib


def reinit_nested_vars(variables, indices=None):
  """Reset all variables in a nested tuple to zeros.

  Args:
    variables: Nested tuple or list of variables.
    indices: Batch indices to reset, defaults to all.

  Returns:
    Operation.
  """
  if isinstance(variables, (tuple, list)):
    return tf.group(*[
        reinit_nested_vars(variable, indices) for variable in variables])
  if indices is None:
    return variables.assign(tf.zeros_like(variables))
  else:
    zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list())
    return tf.scatter_update(variables, indices, zeros)


def assign_nested_vars(variables, tensors, indices=None):
  """Assign tensors to matching nested tuple of variables.

  Args:
    variables: Nested tuple or list of variables to update.
    tensors: Nested tuple or list of tensors to assign.
    indices: Batch indices to assign to, defaults to all.

  Returns:
    Operation.
  """
  if isinstance(variables, (tuple, list)):
    return tf.group(*[
        assign_nested_vars(variable, tensor, indices)
        for variable, tensor in zip(variables, tensors)])
  if indices is None:
    return variables.assign(tensors)
  else:
    return tf.scatter_update(variables, indices, tensors)


def discounted_return(reward, length, discount):
  """Discounted Monte-Carlo returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.reverse(tf.transpose(tf.scan(
      lambda agg, cur: cur + discount * agg,
      tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
      tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')


def fixed_step_return(reward, value, length, discount, window):
  """N-step discounted return."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.zeros_like(reward)
  for _ in range(window):
    return_ += reward
    reward = discount * tf.concat(
        [reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
  return_ += discount ** window * tf.concat(
      [value[:, window:], tf.zeros_like(value[:, -window:])], 1)
  return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')


def lambda_return(reward, value, length, discount, lambda_):
  """TD-lambda returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  sequence = mask * reward + discount * value * (1 - lambda_)
  discount = mask * discount * lambda_
  sequence = tf.stack([sequence, discount], 2)
  return_ = tf.reverse(tf.transpose(tf.scan(
      lambda agg, cur: cur[0] + cur[1] * agg,
      tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
      tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')


def lambda_advantage(reward, value, length, discount):
  """Generalized Advantage Estimation."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
  delta = reward + discount * next_value - value
  advantage = tf.reverse(tf.transpose(tf.scan(
      lambda agg, cur: cur + discount * agg,
      tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
      tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')


def diag_normal_kl(mean0, logstd0, mean1, logstd1):
  """Empirical KL divergence of two normals with diagonal covariance."""
  logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
  return 0.5 * (
      tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
      tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) +
      tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
      mean0.shape[-1].value)


def diag_normal_logpdf(mean, logstd, loc):
  """Log density of a normal with diagonal covariance."""
  constant = -0.5 * math.log(2 * math.pi) - logstd
  value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2
  return tf.reduce_sum(constant + value, -1)


def diag_normal_entropy(mean, logstd):
  """Empirical entropy of a normal with diagonal covariance."""
  constant = mean.shape[-1].value * math.log(2 * math.pi * math.e)
  return (constant + tf.reduce_sum(2 * logstd, 1)) / 2


def available_gpus():
  """List of GPU device names detected by TensorFlow."""
  local_device_protos = device_lib.list_local_devices()
  return [x.name for x in local_device_protos if x.device_type == 'GPU']


def gradient_summaries(grad_vars, groups=None, scope='gradients'):
  """Create histogram summaries of the gradient.

  Summaries can be grouped via regexes matching variable names.

  Args:
    grad_vars: List of (gradient, variable) tuples as returned by optimizers.
    groups: Mapping of name to regex for grouping summaries.
    scope: Name scope for this operation.

  Returns:
    Summary tensor.
  """
  groups = groups or {r'all': r'.*'}
  grouped = collections.defaultdict(list)
  for grad, var in grad_vars:
    if grad is None:
      continue
    for name, pattern in groups.items():
      if re.match(pattern, var.name):
        name = re.sub(pattern, name, var.name)
        grouped[name].append(grad)
  for name in groups:
    if name not in grouped:
      tf.logging.warn("No variables matching '{}' group.".format(name))
  summaries = []
  for name, grads in grouped.items():
    grads = [tf.reshape(grad, [-1]) for grad in grads]
    grads = tf.concat(grads, 0)
    summaries.append(tf.summary.histogram(scope + '/' + name, grads))
  return tf.summary.merge(summaries)


def variable_summaries(vars_, groups=None, scope='weights'):
  """Create histogram summaries for the provided variables.

  Summaries can be grouped via regexes matching variable names.

  Args:
    vars_: List of variables to summarize.
    groups: Mapping of name to regex for grouping summaries.
    scope: Name scope for this operation.

  Returns:
    Summary tensor.
  """
  groups = groups or {r'all': r'.*'}
  grouped = collections.defaultdict(list)
  for var in vars_:
    for name, pattern in groups.items():
      if re.match(pattern, var.name):
        name = re.sub(pattern, name, var.name)
        grouped[name].append(var)
  for name in groups:
    if name not in grouped:
      tf.logging.warn("No variables matching '{}' group.".format(name))
  summaries = []
  # pylint: disable=redefined-argument-from-local
  for name, vars_ in grouped.items():
    vars_ = [tf.reshape(var, [-1]) for var in vars_]
    vars_ = tf.concat(vars_, 0)
    summaries.append(tf.summary.histogram(scope + '/' + name, vars_))
  return tf.summary.merge(summaries)
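Both discounted_return and lambda_advantage above are reverse scans over the time axis; the tf.transpose/tf.reverse plumbing exists only because tf.scan runs over the leading dimension. A plain-Python sketch of the same recurrences for a single unpadded episode (illustrative only, not part of the commit):

def discounted_return(rewards, discount):
  # return[t] = reward[t] + discount * return[t + 1], computed right to left.
  agg, out = 0.0, []
  for r in reversed(rewards):
    agg = r + discount * agg
    out.append(agg)
  return list(reversed(out))

def gae_advantage(rewards, values, discount):
  # Same recurrence applied to TD errors, as in lambda_advantage above
  # (note that lambda_advantage reuses `discount` as the decay factor).
  deltas = [r + discount * nv - v
            for r, v, nv in zip(rewards, values, values[1:] + [0.0])]
  return discounted_return(deltas, discount)

print(discounted_return([1.0, 1.0, 1.0], 0.9))  # [2.71, 1.9, 1.0]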