add a temp copy of TF agents (until the API stops changing or configs.py is included)

Erwin Coumans
2017-11-16 16:47:14 +00:00
parent 7f654bdd87
commit 7b030426c1
24 changed files with 3294 additions and 27 deletions


@@ -0,0 +1,21 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .algorithm import PPOAlgorithm


@@ -0,0 +1,515 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm.
Based on John Schulman's implementation in Python and Theano:
https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from . import memory
from . import normalize
from . import utility
class PPOAlgorithm(object):
"""A vectorized implementation of the PPO algorithm by John Schulman."""
def __init__(self, batch_env, step, is_training, should_log, config):
"""Create an instance of the PPO algorithm.
Args:
batch_env: In-graph batch environment.
step: Integer tensor holding the current training step.
is_training: Boolean tensor for whether the algorithm should train.
should_log: Boolean tensor for whether summaries should be returned.
config: Object containing the agent configuration as attributes.
"""
self._batch_env = batch_env
self._step = step
self._is_training = is_training
self._should_log = should_log
self._config = config
self._observ_filter = normalize.StreamingNormalize(
self._batch_env.observ[0], center=True, scale=True, clip=5,
name='normalize_observ')
self._reward_filter = normalize.StreamingNormalize(
self._batch_env.reward[0], center=False, scale=True, clip=10,
name='normalize_reward')
# Memory stores tuple of observ, action, mean, logstd, reward.
template = (
self._batch_env.observ[0], self._batch_env.action[0],
self._batch_env.action[0], self._batch_env.action[0],
self._batch_env.reward[0])
self._memory = memory.EpisodeMemory(
template, config.update_every, config.max_length, 'memory')
self._memory_index = tf.Variable(0, False)
use_gpu = self._config.use_gpu and utility.available_gpus()
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
# Create network variables for later calls to reuse.
action_size = self._batch_env.action.shape[1].value
self._network = tf.make_template(
'network', functools.partial(config.network, config, action_size))
output = self._network(
tf.zeros_like(self._batch_env.observ)[:, None],
tf.ones(len(self._batch_env)))
with tf.variable_scope('ppo_temporary'):
self._episodes = memory.EpisodeMemory(
template, len(batch_env), config.max_length, 'episodes')
if output.state is None:
self._last_state = None
else:
# Ensure the batch dimension is set.
tf.contrib.framework.nest.map_structure(
lambda x: x.set_shape([len(batch_env)] + x.shape.as_list()[1:]),
output.state)
# pylint: disable=undefined-variable
self._last_state = tf.contrib.framework.nest.map_structure(
lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
output.state)
self._last_action = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_action')
self._last_mean = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_mean')
self._last_logstd = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_logstd')
self._penalty = tf.Variable(
self._config.kl_init_penalty, False, dtype=tf.float32)
self._optimizer = self._config.optimizer(self._config.learning_rate)
def begin_episode(self, agent_indices):
"""Reset the recurrent states and stored episode.
Args:
agent_indices: Tensor containing current batch indices.
Returns:
Summary tensor.
"""
with tf.name_scope('begin_episode/'):
if self._last_state is None:
reset_state = tf.no_op()
else:
reset_state = utility.reinit_nested_vars(
self._last_state, agent_indices)
reset_buffer = self._episodes.clear(agent_indices)
with tf.control_dependencies([reset_state, reset_buffer]):
return tf.constant('')
def perform(self, agent_indices, observ):
"""Compute batch of actions and a summary for a batch of observation.
Args:
agent_indices: Tensor containing current batch indices.
observ: Tensor of a batch of observations for all agents.
Returns:
Tuple of action batch tensor and summary tensor.
"""
with tf.name_scope('perform/'):
observ = self._observ_filter.transform(observ)
if self._last_state is None:
state = None
else:
state = tf.contrib.framework.nest.map_structure(
lambda x: tf.gather(x, agent_indices), self._last_state)
output = self._network(observ[:, None], tf.ones(observ.shape[0]), state)
action = tf.cond(
self._is_training, output.policy.sample, lambda: output.mean)
logprob = output.policy.log_prob(action)[:, 0]
# pylint: disable=g-long-lambda
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
tf.summary.histogram('mean', output.mean[:, 0]),
tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
tf.summary.histogram('action', action[:, 0]),
tf.summary.histogram('logprob', logprob)]), str)
# Remember current policy to append to memory in the experience callback.
if self._last_state is None:
assign_state = tf.no_op()
else:
assign_state = utility.assign_nested_vars(
self._last_state, output.state, agent_indices)
with tf.control_dependencies([
assign_state,
tf.scatter_update(
self._last_action, agent_indices, action[:, 0]),
tf.scatter_update(
self._last_mean, agent_indices, output.mean[:, 0]),
tf.scatter_update(
self._last_logstd, agent_indices, output.logstd[:, 0])]):
return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
def experience(
self, agent_indices, observ, action, reward, unused_done, unused_nextob):
"""Process the transition tuple of the current step.
When training, add the current transition tuple to the memory and update
the streaming statistics for observations and rewards. A summary string is
returned if requested at this step.
Args:
agent_indices: Tensor containing current batch indices.
observ: Batch tensor of observations.
action: Batch tensor of actions.
reward: Batch tensor of rewards.
unused_done: Batch tensor of done flags.
unused_nextob: Batch tensor of successor observations.
Returns:
Summary tensor.
"""
with tf.name_scope('experience/'):
return tf.cond(
self._is_training,
# pylint: disable=g-long-lambda
lambda: self._define_experience(
agent_indices, observ, action, reward), str)
def _define_experience(self, agent_indices, observ, action, reward):
"""Implement the branch of experience() entered during training."""
update_filters = tf.summary.merge([
self._observ_filter.update(observ),
self._reward_filter.update(reward)])
with tf.control_dependencies([update_filters]):
if self._config.train_on_agent_action:
# NOTE: Doesn't seem to change much.
action = self._last_action
batch = (
observ, action, tf.gather(self._last_mean, agent_indices),
tf.gather(self._last_logstd, agent_indices), reward)
append = self._episodes.append(batch, agent_indices)
with tf.control_dependencies([append]):
norm_observ = self._observ_filter.transform(observ)
norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
# pylint: disable=g-long-lambda
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
update_filters,
self._observ_filter.summary(),
self._reward_filter.summary(),
tf.summary.scalar('memory_size', self._memory_index),
tf.summary.histogram('normalized_observ', norm_observ),
tf.summary.histogram('action', self._last_action),
tf.summary.scalar('normalized_reward', norm_reward)]), str)
return summary
def end_episode(self, agent_indices):
"""Add episodes to the memory and perform update steps if memory is full.
During training, add the collected episodes of the batch indices that
finished their episode to the memory. If the memory is full, train on it,
and then clear the memory. A summary string is returned if requested at
this step.
Args:
agent_indices: Tensor containing current batch indices.
Returns:
Summary tensor.
"""
with tf.name_scope('end_episode/'):
return tf.cond(
self._is_training,
lambda: self._define_end_episode(agent_indices), str)
def _define_end_episode(self, agent_indices):
"""Implement the branch of end_episode() entered during training."""
episodes, length = self._episodes.data(agent_indices)
space_left = self._config.update_every - self._memory_index
use_episodes = tf.range(tf.minimum(
tf.shape(agent_indices)[0], space_left))
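# Episodes that do not fit into the remaining memory space are dropped.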
episodes = [tf.gather(elem, use_episodes) for elem in episodes]
append = self._memory.replace(
episodes, tf.gather(length, use_episodes),
use_episodes + self._memory_index)
with tf.control_dependencies([append]):
inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
with tf.control_dependencies([inc_index]):
memory_full = self._memory_index >= self._config.update_every
return tf.cond(memory_full, self._training, str)
def _training(self):
"""Perform multiple training iterations of both policy and value baseline.
Trains on the episodes collected in the memory, then resets the memory.
Always returns a summary string.
Returns:
Summary tensor.
"""
with tf.name_scope('training'):
assert_full = tf.assert_equal(
self._memory_index, self._config.update_every)
with tf.control_dependencies([assert_full]):
data = self._memory.data()
(observ, action, old_mean, old_logstd, reward), length = data
with tf.control_dependencies([tf.assert_greater(length, 0)]):
length = tf.identity(length)
observ = self._observ_filter.transform(observ)
reward = self._reward_filter.transform(reward)
update_summary = self._perform_update_steps(
observ, action, old_mean, old_logstd, reward, length)
with tf.control_dependencies([update_summary]):
penalty_summary = self._adjust_penalty(
observ, old_mean, old_logstd, length)
with tf.control_dependencies([penalty_summary]):
clear_memory = tf.group(
self._memory.clear(), self._memory_index.assign(0))
with tf.control_dependencies([clear_memory]):
weight_summary = utility.variable_summaries(
tf.trainable_variables(), self._config.weight_summaries)
return tf.summary.merge([
update_summary, penalty_summary, weight_summary])
def _perform_update_steps(
self, observ, action, old_mean, old_logstd, reward, length):
"""Perform multiple update steps of value function and policy.
The advantage is computed once at the beginning and shared across
iterations. Only one iteration's summary can be returned, so the summary from
the middle iteration is used.
Args:
observ: Sequences of observations.
action: Sequences of actions.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
reward: Sequences of rewards.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
return_ = utility.discounted_return(
reward, length, self._config.discount)
value = self._network(observ, length).value
if self._config.gae_lambda:
advantage = utility.lambda_return(
reward, value, length, self._config.discount,
self._config.gae_lambda)
else:
advantage = return_ - value
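# Standardize the advantage estimates to zero mean and unit variance before
# computing the surrogate loss.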
mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
advantage = tf.Print(
advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
'return and value: ')
advantage = tf.Print(
advantage, [tf.reduce_mean(advantage)],
'normalized advantage: ')
# pylint: disable=g-long-lambda
value_loss, policy_loss, summary = tf.scan(
lambda _1, _2: self._update_step(
observ, action, old_mean, old_logstd, reward, advantage, length),
tf.range(self._config.update_epochs),
[0., 0., ''], parallel_iterations=1)
print_losses = tf.group(
tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
with tf.control_dependencies([value_loss, policy_loss, print_losses]):
return summary[self._config.update_epochs // 2]
def _update_step(
self, observ, action, old_mean, old_logstd, reward, advantage, length):
"""Compute the current combined loss and perform a gradient update step.
Args:
observ: Sequences of observations.
action: Sequences of actions.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
reward: Sequences of reward.
advantage: Sequences of advantages.
length: Batch of sequence lengths.
Returns:
Tuple of value loss, policy loss, and summary tensor.
"""
value_loss, value_summary = self._value_loss(observ, reward, length)
network = self._network(observ, length)
policy_loss, policy_summary = self._policy_loss(
network.mean, network.logstd, old_mean, old_logstd, action,
advantage, length)
value_gradients, value_variables = (
zip(*self._optimizer.compute_gradients(value_loss)))
policy_gradients, policy_variables = (
zip(*self._optimizer.compute_gradients(policy_loss)))
all_gradients = value_gradients + policy_gradients
all_variables = value_variables + policy_variables
optimize = self._optimizer.apply_gradients(
zip(all_gradients, all_variables))
summary = tf.summary.merge([
value_summary, policy_summary,
tf.summary.scalar(
'value_gradient_norm', tf.global_norm(value_gradients)),
tf.summary.scalar(
'policy_gradient_norm', tf.global_norm(policy_gradients)),
utility.gradient_summaries(
zip(value_gradients, value_variables), dict(value=r'.*')),
utility.gradient_summaries(
zip(policy_gradients, policy_variables), dict(policy=r'.*'))])
with tf.control_dependencies([optimize]):
return [tf.identity(x) for x in (value_loss, policy_loss, summary)]
def _value_loss(self, observ, reward, length):
"""Compute the loss function for the value baseline.
The value loss is the mean squared difference between empirical and
approximated returns over the collected episodes. Returns the loss tensor and
a summary string.
Args:
observ: Sequences of observations.
reward: Sequences of reward.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
with tf.name_scope('value_loss'):
value = self._network(observ, length).value
return_ = utility.discounted_return(
reward, length, self._config.discount)
advantage = return_ - value
value_loss = 0.5 * self._mask(advantage ** 2, length)
summary = tf.summary.merge([
tf.summary.histogram('value_loss', value_loss),
tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
value_loss = tf.reduce_mean(value_loss)
return tf.check_numerics(value_loss, 'value_loss'), summary
def _policy_loss(
self, mean, logstd, old_mean, old_logstd, action, advantage, length):
"""Compute the policy loss composed of multiple components.
1. The policy gradient loss is importance sampled from the behavioral policy
that collected the data for the current training phase.
2. The second term is a KL penalty between that behavioral policy and the
current policy.
3. Additionally, if this KL already changed more than twice the target
amount, we activate a strong penalty discouraging further divergence.
Args:
mean: Sequences of action means of the current policy.
logstd: Sequences of action log stddevs of the current policy.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
action: Sequences of actions.
advantage: Sequences of advantages.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
with tf.name_scope('policy_loss'):
entropy = utility.diag_normal_entropy(mean, logstd)
kl = tf.reduce_mean(self._mask(utility.diag_normal_kl(
old_mean, old_logstd, mean, logstd), length), 1)
policy_gradient = tf.exp(
utility.diag_normal_logpdf(mean, logstd, action) -
utility.diag_normal_logpdf(old_mean, old_logstd, action))
surrogate_loss = -tf.reduce_mean(self._mask(
policy_gradient * tf.stop_gradient(advantage), length), 1)
kl_penalty = self._penalty * kl
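# Additional hard quadratic penalty that only activates once the KL exceeds
# kl_cutoff_factor times the KL target.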
cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
cutoff_count = tf.reduce_sum(
tf.cast(kl > cutoff_threshold, tf.int32))
with tf.control_dependencies([tf.cond(
cutoff_count > 0,
lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
kl_cutoff = (
self._config.kl_cutoff_coef *
tf.cast(kl > cutoff_threshold, tf.float32) *
(kl - cutoff_threshold) ** 2)
policy_loss = surrogate_loss + kl_penalty + kl_cutoff
summary = tf.summary.merge([
tf.summary.histogram('entropy', entropy),
tf.summary.histogram('kl', kl),
tf.summary.histogram('surrogate_loss', surrogate_loss),
tf.summary.histogram('kl_penalty', kl_penalty),
tf.summary.histogram('kl_cutoff', kl_cutoff),
tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
tf.summary.histogram('policy_loss', policy_loss),
tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
policy_loss = tf.reduce_mean(policy_loss, 0)
return tf.check_numerics(policy_loss, 'policy_loss'), summary
def _adjust_penalty(self, observ, old_mean, old_logstd, length):
"""Adjust the KL policy between the behavioral and current policy.
Compute how much the policy actually changed during the multiple
update steps. Adjust the penalty strength for the next training phase if we
overshot or undershot the target divergence too much.
Args:
observ: Sequences of observations.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
with tf.name_scope('adjust_penalty'):
network = self._network(observ, length)
assert_change = tf.assert_equal(
tf.reduce_all(tf.equal(network.mean, old_mean)), False,
message='policy should change')
print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
with tf.control_dependencies([assert_change, print_penalty]):
kl_change = tf.reduce_mean(self._mask(utility.diag_normal_kl(
old_mean, old_logstd, network.mean, network.logstd), length))
kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
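# Multiplicatively adapt the penalty so the measured KL moves toward the
# configured target divergence.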
maybe_increase = tf.cond(
kl_change > 1.3 * self._config.kl_target,
# pylint: disable=g-long-lambda
lambda: tf.Print(self._penalty.assign(
self._penalty * 1.5), [0], 'increase penalty '),
float)
maybe_decrease = tf.cond(
kl_change < 0.7 * self._config.kl_target,
# pylint: disable=g-long-lambda
lambda: tf.Print(self._penalty.assign(
self._penalty / 1.5), [0], 'decrease penalty '),
float)
with tf.control_dependencies([maybe_increase, maybe_decrease]):
return tf.summary.merge([
tf.summary.scalar('kl_change', kl_change),
tf.summary.scalar('penalty', self._penalty)])
def _mask(self, tensor, length):
"""Set padding elements of a batch of sequences to zero.
Useful to then safely sum along the time dimension.
Args:
tensor: Tensor of sequences.
length: Batch of sequence lengths.
Returns:
Masked sequences.
"""
with tf.name_scope('mask'):
range_ = tf.range(tensor.shape[1].value)
mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
masked = tensor * mask
return tf.check_numerics(masked, 'masked')
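For reference, here is a standalone NumPy sketch of the three policy-loss terms described in _policy_loss() above, written for flat batches of diagonal-Gaussian statistics and ignoring the sequence masking that the class performs. The cutoff_factor and cutoff_coef defaults below are placeholders for config.kl_cutoff_factor and config.kl_cutoff_coef, and all names are illustrative rather than part of the class API.

import numpy as np

def diag_normal_logpdf(mean, logstd, x):
  # Log density of a diagonal Gaussian, summed over the action dimension.
  return np.sum(-0.5 * np.log(2 * np.pi) - logstd
                - 0.5 * ((x - mean) / np.exp(logstd)) ** 2, -1)

def diag_normal_kl(mean0, logstd0, mean1, logstd1):
  # KL(N0 || N1) for diagonal Gaussians, summed over the action dimension.
  var0, var1 = np.exp(2 * logstd0), np.exp(2 * logstd1)
  return 0.5 * np.sum(var0 / var1 + (mean1 - mean0) ** 2 / var1
                      + 2 * logstd1 - 2 * logstd0 - 1, -1)

def ppo_policy_loss(mean, logstd, old_mean, old_logstd, action, advantage,
                    penalty, kl_target, cutoff_factor=2.0, cutoff_coef=1000.0):
  ratio = np.exp(diag_normal_logpdf(mean, logstd, action)
                 - diag_normal_logpdf(old_mean, old_logstd, action))
  surrogate = -np.mean(ratio * advantage)      # 1. importance-sampled PG loss
  kl = diag_normal_kl(old_mean, old_logstd, mean, logstd)
  kl_penalty = penalty * np.mean(kl)           # 2. adaptive KL penalty
  cutoff = cutoff_factor * kl_target           # 3. quadratic KL cutoff
  kl_cutoff = cutoff_coef * np.mean((kl > cutoff) * (kl - cutoff) ** 2)
  return surrogate + kl_penalty + kl_cutoff

mean = np.zeros((64, 3))
logstd = np.full((64, 3), -0.5)
action = np.random.randn(64, 3)
advantage = np.random.randn(64)
loss = ppo_policy_loss(mean, logstd, mean + 0.01, logstd, action, advantage,
                       penalty=1.0, kl_target=0.01)

_adjust_penalty() above then multiplies the penalty by 1.5 whenever the measured KL exceeds 1.3 * kl_target and divides it by 1.5 whenever it falls below 0.7 * kl_target, so the penalty term tracks the target divergence across training phases.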


@@ -0,0 +1,152 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Memory that stores episodes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class EpisodeMemory(object):
"""Memory that stores episodes."""
def __init__(self, template, capacity, max_length, scope):
"""Create a memory that stores episodes.
Each transition tuple consists of quantities specified by the template.
These quantities would typically be observations, actions, rewards, and
done indicators.
Args:
template: List of tensors to derive shapes and dtypes of each transition.
capacity: Number of episodes, or rows, held by the memory.
max_length: Allocated sequence length for the episodes.
scope: Variable scope to use for internal variables.
"""
self._capacity = capacity
self._max_length = max_length
with tf.variable_scope(scope) as var_scope:
self._scope = var_scope
self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
self._buffers = [
tf.Variable(tf.zeros(
[capacity, max_length] + elem.shape.as_list(),
elem.dtype), False)
for elem in template]
def length(self, rows=None):
"""Tensor holding the current length of episodes.
Args:
rows: Episodes to select length from, defaults to all.
Returns:
Batch tensor of sequence lengths.
"""
rows = tf.range(self._capacity) if rows is None else rows
return tf.gather(self._length, rows)
def append(self, transitions, rows=None):
"""Append a batch of transitions to rows of the memory.
Args:
transitions: Tuple of transition quantities with batch dimension.
rows: Episodes to append to, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
assert_capacity = tf.assert_less(
rows, self._capacity,
message='capacity exceeded')
with tf.control_dependencies([assert_capacity]):
assert_max_length = tf.assert_less(
tf.gather(self._length, rows), self._max_length,
message='max length exceeded')
append_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, transitions):
timestep = tf.gather(self._length, rows)
indices = tf.stack([rows, timestep], 1)
append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
with tf.control_dependencies(append_ops):
episode_mask = tf.reduce_sum(tf.one_hot(
rows, self._capacity, dtype=tf.int32), 0)
return self._length.assign_add(episode_mask)
def replace(self, episodes, length, rows=None):
"""Replace full episodes.
Args:
episodes: Tuple of transition quantities with batch and time dimensions.
length: Batch of sequence lengths.
rows: Episodes to replace, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
assert_capacity = tf.assert_less(
rows, self._capacity, message='capacity exceeded')
with tf.control_dependencies([assert_capacity]):
assert_max_length = tf.assert_less_equal(
length, self._max_length, message='max length exceeded')
replace_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, episodes):
replace_op = tf.scatter_update(buffer_, rows, elements)
replace_ops.append(replace_op)
with tf.control_dependencies(replace_ops):
return tf.scatter_update(self._length, rows, length)
def data(self, rows=None):
"""Access a batch of episodes from the memory.
Padding elements after the length of each episode are unspecified and might
contain old data.
Args:
rows: Episodes to select, defaults to all.
Returns:
Tuple containing a tuple of transition quantities with batch and time
dimensions, and a batch of sequence lengths.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers]
length = tf.gather(self._length, rows)
return episode, length
def clear(self, rows=None):
"""Reset episodes in the memory.
Internally, this only sets their lengths to zero. The memory entries will
be overridden by future calls to append() or replace().
Args:
rows: Episodes to clear, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
return tf.scatter_update(self._length, rows, tf.zeros_like(rows))
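A minimal usage sketch of the class above, assuming TensorFlow 1.x and that EpisodeMemory is in scope; the template shapes and the capacity/length values are made up for illustration.

import tensorflow as tf

# Template: one observation, one action, one scalar reward per transition.
template = (tf.zeros([4]), tf.zeros([2]), tf.zeros([]))
memory = EpisodeMemory(template, capacity=8, max_length=100, scope='demo_memory')

rows = tf.constant([0, 1])                         # write to episodes 0 and 1
batch = (tf.ones([2, 4]), tf.ones([2, 2]), tf.ones([2]))
append = memory.append(batch, rows)                # appends one step per row

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(append)
  episodes, length = sess.run(memory.data(rows))
  print(length)                                    # -> [1 1]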


@@ -0,0 +1,168 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize tensors based on streaming estimates of mean and variance."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class StreamingNormalize(object):
"""Normalize tensors based on streaming estimates of mean and variance."""
def __init__(
self, template, center=True, scale=True, clip=10, name='normalize'):
"""Normalize tensors based on streaming estimates of mean and variance.
Centering the value, scaling it by the standard deviation, and clipping
outlier values are optional.
Args:
template: Example tensor providing shape and dtype of the value to track.
center: Python boolean indicating whether to subtract mean from values.
scale: Python boolean indicating whether to scale values by stddev.
clip: Magnitude at which to clip normalized values; falsy disables clipping.
name: Parent scope of operations provided by this class.
"""
self._center = center
self._scale = scale
self._clip = clip
self._name = name
with tf.name_scope(name):
self._count = tf.Variable(0, False)
self._mean = tf.Variable(tf.zeros_like(template), False)
self._var_sum = tf.Variable(tf.zeros_like(template), False)
def transform(self, value):
"""Normalize a single or batch tensor.
Applies the activated transformations in the constructor using current
estimates of mean and variance.
Args:
value: Batch or single value tensor.
Returns:
Normalized batch or single value tensor.
"""
with tf.name_scope(self._name + '/transform'):
no_batch_dim = value.shape.ndims == self._mean.shape.ndims
if no_batch_dim:
# Add a batch dimension if necessary.
value = value[None, ...]
if self._center:
value -= self._mean[None, ...]
if self._scale:
# We cannot scale before seeing at least two samples.
value /= tf.cond(
self._count > 1, lambda: self._std() + 1e-8,
lambda: tf.ones_like(self._var_sum))[None]
if self._clip:
value = tf.clip_by_value(value, -self._clip, self._clip)
# Remove batch dimension if necessary.
if no_batch_dim:
value = value[0]
return tf.check_numerics(value, 'value')
def update(self, value):
"""Update the mean and variance estimates.
Args:
value: Batch or single value tensor.
Returns:
Summary tensor.
"""
with tf.name_scope(self._name + '/update'):
if value.shape.ndims == self._mean.shape.ndims:
# Add a batch dimension if necessary.
value = value[None, ...]
count = tf.shape(value)[0]
with tf.control_dependencies([self._count.assign_add(count)]):
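# Batched Welford-style update of the running mean and of the summed
# squared deviations.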
step = tf.cast(self._count, tf.float32)
mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
new_mean = self._mean + mean_delta / step
new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
var_delta = (
value - self._mean[None, ...]) * (value - new_mean[None, ...])
new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
with tf.control_dependencies([new_mean, new_var_sum]):
update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
with tf.control_dependencies(update):
if value.shape.ndims == 1:
value = tf.reduce_mean(value)
return self._summary('value', tf.reduce_mean(value))
def reset(self):
"""Reset the estimates of mean and variance.
Resets the full state of this class.
Returns:
Operation.
"""
with tf.name_scope(self._name + '/reset'):
return tf.group(
self._count.assign(0),
self._mean.assign(tf.zeros_like(self._mean)),
self._var_sum.assign(tf.zeros_like(self._var_sum)))
def summary(self):
"""Summary string of mean and standard deviation.
Returns:
Summary tensor.
"""
with tf.name_scope(self._name + '/summary'):
mean_summary = tf.cond(
self._count > 0, lambda: self._summary('mean', self._mean), str)
std_summary = tf.cond(
self._count > 1, lambda: self._summary('stddev', self._std()), str)
return tf.summary.merge([mean_summary, std_summary])
def _std(self):
"""Computes the current estimate of the standard deviation.
Note that the standard deviation is not defined until at least two samples
have been seen.
Returns:
Tensor of the current standard deviation estimate.
"""
variance = tf.cond(
self._count > 1,
lambda: self._var_sum / tf.cast(self._count - 1, tf.float32),
lambda: tf.ones_like(self._var_sum) * float('nan'))
# The epsilon corrects for small negative variance values caused by
# the algorithm. It was empirically chosen to work with all environments
# tested.
return tf.sqrt(variance + 1e-4)
def _summary(self, name, tensor):
"""Create a scalar or histogram summary matching the rank of the tensor.
Args:
name: Name for the summary.
tensor: Tensor to summarize.
Returns:
Summary tensor.
"""
if tensor.shape.ndims == 0:
return tf.summary.scalar(name, tensor)
else:
return tf.summary.histogram(name, tensor)
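For intuition, the streaming estimate kept by update() above can be restated in NumPy as a batched Welford-style update; this sketch is illustrative only and its names are not part of the class.

import numpy as np

def streaming_update(count, mean, var_sum, batch):
  # Update the running mean and the summed squared deviations from one batch.
  new_count = count + batch.shape[0]
  new_mean = mean + (batch - mean).sum(0) / new_count
  if new_count <= 1:
    new_mean = batch[0]                 # a single first sample seeds the mean
  new_var_sum = var_sum + ((batch - mean) * (batch - new_mean)).sum(0)
  return new_count, new_mean, new_var_sum

count, mean, var_sum = 0, np.zeros(3), np.zeros(3)
for batch in np.split(np.random.randn(100, 3), 10):
  count, mean, var_sum = streaming_update(count, mean, var_sum, batch)
std = np.sqrt(var_sum / (count - 1) + 1e-4)        # same epsilon as _std()
normalized = np.clip((np.random.randn(3) - mean) / (std + 1e-8), -10, 10)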


@@ -0,0 +1,213 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for the PPO algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import re
import tensorflow as tf
from tensorflow.python.client import device_lib
def reinit_nested_vars(variables, indices=None):
"""Reset all variables in a nested tuple to zeros.
Args:
variables: Nested tuple or list of variables.
indices: Batch indices to reset, defaults to all.
Returns:
Operation.
"""
if isinstance(variables, (tuple, list)):
return tf.group(*[
reinit_nested_vars(variable, indices) for variable in variables])
if indices is None:
return variables.assign(tf.zeros_like(variables))
else:
zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list())
return tf.scatter_update(variables, indices, zeros)
def assign_nested_vars(variables, tensors, indices=None):
"""Assign tensors to matching nested tuple of variables.
Args:
variables: Nested tuple or list of variables to update.
tensors: Nested tuple or list of tensors to assign.
indices: Batch indices to assign to, defaults to all.
Returns:
Operation.
"""
if isinstance(variables, (tuple, list)):
return tf.group(*[
assign_nested_vars(variable, tensor, indices)
for variable, tensor in zip(variables, tensors)])
if indices is None:
return variables.assign(tensors)
else:
return tf.scatter_update(variables, indices, tensors)
def discounted_return(reward, length, discount):
"""Discounted Monte-Carlo returns."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
return_ = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur + discount * agg,
tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
def fixed_step_return(reward, value, length, discount, window):
"""N-step discounted return."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
return_ = tf.zeros_like(reward)
for _ in range(window):
return_ += reward
reward = discount * tf.concat(
[reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
return_ += discount ** window * tf.concat(
[value[:, window:], tf.zeros_like(value[:, -window:])], 1)
return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
def lambda_return(reward, value, length, discount, lambda_):
"""TD-lambda returns."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
sequence = mask * reward + discount * value * (1 - lambda_)
discount = mask * discount * lambda_
sequence = tf.stack([sequence, discount], 2)
return_ = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur[0] + cur[1] * agg,
tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
def lambda_advantage(reward, value, length, discount):
"""Generalized Advantage Estimation."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
delta = reward + discount * next_value - value
advantage = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur + discount * agg,
tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
def diag_normal_kl(mean0, logstd0, mean1, logstd1):
"""Epirical KL divergence of two normals with diagonal covariance."""
logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
return 0.5 * (
tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) +
tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
mean0.shape[-1].value)
def diag_normal_logpdf(mean, logstd, loc):
"""Log density of a normal with diagonal covariance."""
constant = -0.5 * math.log(2 * math.pi) - logstd
value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2
return tf.reduce_sum(constant + value, -1)
def diag_normal_entropy(mean, logstd):
"""Empirical entropy of a normal with diagonal covariance."""
constant = mean.shape[-1].value * math.log(2 * math.pi * math.e)
return (constant + tf.reduce_sum(2 * logstd, 1)) / 2
def available_gpus():
"""List of GPU device names detected by TensorFlow."""
local_device_protos = device_lib.list_local_devices()
return [x.name for x in local_device_protos if x.device_type == 'GPU']
def gradient_summaries(grad_vars, groups=None, scope='gradients'):
"""Create histogram summaries of the gradient.
Summaries can be grouped via regexes matching variable names.
Args:
grad_vars: List of (gradient, variable) tuples as returned by optimizers.
groups: Mapping of name to regex for grouping summaries.
scope: Name scope for this operation.
Returns:
Summary tensor.
"""
groups = groups or {r'all': r'.*'}
grouped = collections.defaultdict(list)
for grad, var in grad_vars:
if grad is None:
continue
for name, pattern in groups.items():
if re.match(pattern, var.name):
name = re.sub(pattern, name, var.name)
grouped[name].append(grad)
for name in groups:
if name not in grouped:
tf.logging.warn("No variables matching '{}' group.".format(name))
summaries = []
for name, grads in grouped.items():
grads = [tf.reshape(grad, [-1]) for grad in grads]
grads = tf.concat(grads, 0)
summaries.append(tf.summary.histogram(scope + '/' + name, grads))
return tf.summary.merge(summaries)
def variable_summaries(vars_, groups=None, scope='weights'):
"""Create histogram summaries for the provided variables.
Summaries can be grouped via regexes matching variable names.
Args:
vars_: List of variables to summarize.
groups: Mapping of name to regex for grouping summaries.
scope: Name scope for this operation.
Returns:
Summary tensor.
"""
groups = groups or {r'all': r'.*'}
grouped = collections.defaultdict(list)
for var in vars_:
for name, pattern in groups.items():
if re.match(pattern, var.name):
name = re.sub(pattern, name, var.name)
grouped[name].append(var)
for name in groups:
if name not in grouped:
tf.logging.warn("No variables matching '{}' group.".format(name))
summaries = []
# pylint: disable=redefined-argument-from-local
for name, vars_ in grouped.items():
vars_ = [tf.reshape(var, [-1]) for var in vars_]
vars_ = tf.concat(vars_, 0)
summaries.append(tf.summary.histogram(scope + '/' + name, vars_))
return tf.summary.merge(summaries)
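To make the return computation above concrete, here is a NumPy restatement of discounted_return() with a tiny worked example; the TF version performs the same recurrence with a reversed tf.scan, and this sketch is illustrative only.

import numpy as np

def discounted_return_np(reward, length, discount):
  # Reverse accumulation of r_t + discount * R_{t+1}, with steps past the
  # episode length masked to zero.
  batch, time = reward.shape
  mask = (np.arange(time)[None, :] < length[:, None]).astype(reward.dtype)
  reward = mask * reward
  return_ = np.zeros_like(reward)
  running = np.zeros(batch)
  for t in reversed(range(time)):
    running = reward[:, t] + discount * running
    return_[:, t] = running
  return return_

reward = np.array([[1.0, 1.0, 1.0, 0.0]])     # length-3 episode plus padding
print(discounted_return_np(reward, np.array([3]), 0.9))
# -> [[2.71 1.9  1.   0.  ]]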