add yapf style and apply yapf to format all Python files

This recreates pull request #2192
Erwin Coumans
2019-04-27 07:31:15 -07:00
parent c591735042
commit ef9570c315
347 changed files with 70304 additions and 22752 deletions


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm."""
from __future__ import absolute_import


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm.
Based on John Schulman's implementation in Python and Theano:
@@ -30,9 +29,7 @@ from . import memory
from . import normalize
from . import utility
-_NetworkOutput = collections.namedtuple(
-    'NetworkOutput', 'policy, mean, logstd, value, state')
+_NetworkOutput = collections.namedtuple('NetworkOutput', 'policy, mean, logstd, value, state')
class PPOAlgorithm(object):
@@ -53,44 +50,46 @@ class PPOAlgorithm(object):
self._is_training = is_training
self._should_log = should_log
self._config = config
-    self._observ_filter = normalize.StreamingNormalize(
-        self._batch_env.observ[0], center=True, scale=True, clip=5,
-        name='normalize_observ')
-    self._reward_filter = normalize.StreamingNormalize(
-        self._batch_env.reward[0], center=False, scale=True, clip=10,
-        name='normalize_reward')
+    self._observ_filter = normalize.StreamingNormalize(self._batch_env.observ[0],
+                                                       center=True,
+                                                       scale=True,
+                                                       clip=5,
+                                                       name='normalize_observ')
+    self._reward_filter = normalize.StreamingNormalize(self._batch_env.reward[0],
+                                                       center=False,
+                                                       scale=True,
+                                                       clip=10,
+                                                       name='normalize_reward')
# Memory stores tuple of observ, action, mean, logstd, reward.
-    template = (
-        self._batch_env.observ[0], self._batch_env.action[0],
-        self._batch_env.action[0], self._batch_env.action[0],
-        self._batch_env.reward[0])
-    self._memory = memory.EpisodeMemory(
-        template, config.update_every, config.max_length, 'memory')
+    template = (self._batch_env.observ[0], self._batch_env.action[0], self._batch_env.action[0],
+                self._batch_env.action[0], self._batch_env.reward[0])
+    self._memory = memory.EpisodeMemory(template, config.update_every, config.max_length, 'memory')
self._memory_index = tf.Variable(0, False)
use_gpu = self._config.use_gpu and utility.available_gpus()
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
# Create network variables for later calls to reuse.
-      self._network(
-          tf.zeros_like(self._batch_env.observ)[:, None],
-          tf.ones(len(self._batch_env)), reuse=None)
+      self._network(tf.zeros_like(self._batch_env.observ)[:, None],
+                    tf.ones(len(self._batch_env)),
+                    reuse=None)
cell = self._config.network(self._batch_env.action.shape[1].value)
with tf.variable_scope('ppo_temporary'):
-      self._episodes = memory.EpisodeMemory(
-          template, len(batch_env), config.max_length, 'episodes')
-      self._last_state = utility.create_nested_vars(
-          cell.zero_state(len(batch_env), tf.float32))
-      self._last_action = tf.Variable(
-          tf.zeros_like(self._batch_env.action), False, name='last_action')
-      self._last_mean = tf.Variable(
-          tf.zeros_like(self._batch_env.action), False, name='last_mean')
-      self._last_logstd = tf.Variable(
-          tf.zeros_like(self._batch_env.action), False, name='last_logstd')
-    self._penalty = tf.Variable(
-        self._config.kl_init_penalty, False, dtype=tf.float32)
-    self._policy_optimizer = self._config.policy_optimizer(
-        self._config.policy_lr, name='policy_optimizer')
-    self._value_optimizer = self._config.value_optimizer(
-        self._config.value_lr, name='value_optimizer')
+      self._episodes = memory.EpisodeMemory(template, len(batch_env), config.max_length,
+                                            'episodes')
+      self._last_state = utility.create_nested_vars(cell.zero_state(len(batch_env), tf.float32))
+      self._last_action = tf.Variable(tf.zeros_like(self._batch_env.action),
+                                      False,
+                                      name='last_action')
+      self._last_mean = tf.Variable(tf.zeros_like(self._batch_env.action),
+                                    False,
+                                    name='last_mean')
+      self._last_logstd = tf.Variable(tf.zeros_like(self._batch_env.action),
+                                      False,
+                                      name='last_logstd')
+    self._penalty = tf.Variable(self._config.kl_init_penalty, False, dtype=tf.float32)
+    self._policy_optimizer = self._config.policy_optimizer(self._config.policy_lr,
+                                                           name='policy_optimizer')
+    self._value_optimizer = self._config.value_optimizer(self._config.value_lr,
+                                                         name='value_optimizer')
def begin_episode(self, agent_indices):
"""Reset the recurrent states and stored episode.
@@ -118,23 +117,24 @@ class PPOAlgorithm(object):
"""
with tf.name_scope('perform/'):
observ = self._observ_filter.transform(observ)
-      network = self._network(
-          observ[:, None], tf.ones(observ.shape[0]), self._last_state)
-      action = tf.cond(
-          self._is_training, network.policy.sample, lambda: network.mean)
+      network = self._network(observ[:, None], tf.ones(observ.shape[0]), self._last_state)
+      action = tf.cond(self._is_training, network.policy.sample, lambda: network.mean)
logprob = network.policy.log_prob(action)[:, 0]
# pylint: disable=g-long-lambda
-      summary = tf.cond(self._should_log, lambda: tf.summary.merge([
-          tf.summary.histogram('mean', network.mean[:, 0]),
-          tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
-          tf.summary.histogram('action', action[:, 0]),
-          tf.summary.histogram('logprob', logprob)]), str)
+      summary = tf.cond(
+          self._should_log, lambda: tf.summary.merge([
+              tf.summary.histogram('mean', network.mean[:, 0]),
+              tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
+              tf.summary.histogram('action', action[:, 0]),
+              tf.summary.histogram('logprob', logprob)
+          ]), str)
# Remember current policy to append to memory in the experience callback.
with tf.control_dependencies([
utility.assign_nested_vars(self._last_state, network.state),
self._last_action.assign(action[:, 0]),
self._last_mean.assign(network.mean[:, 0]),
-          self._last_logstd.assign(network.logstd[:, 0])]):
+          self._last_logstd.assign(network.logstd[:, 0])
+      ]):
return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
def experience(self, observ, action, reward, unused_done, unused_nextob):
@@ -155,15 +155,14 @@ class PPOAlgorithm(object):
Summary tensor.
"""
with tf.name_scope('experience/'):
-      return tf.cond(
-          self._is_training,
-          lambda: self._define_experience(observ, action, reward), str)
+      return tf.cond(self._is_training, lambda: self._define_experience(observ, action, reward),
+                     str)
def _define_experience(self, observ, action, reward):
"""Implement the branch of experience() entered during training."""
-    update_filters = tf.summary.merge([
-        self._observ_filter.update(observ),
-        self._reward_filter.update(reward)])
+    update_filters = tf.summary.merge(
+        [self._observ_filter.update(observ),
+         self._reward_filter.update(reward)])
with tf.control_dependencies([update_filters]):
if self._config.train_on_agent_action:
# NOTE: Doesn't seem to change much.
@@ -174,14 +173,16 @@ class PPOAlgorithm(object):
norm_observ = self._observ_filter.transform(observ)
norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
# pylint: disable=g-long-lambda
-      summary = tf.cond(self._should_log, lambda: tf.summary.merge([
-          update_filters,
-          self._observ_filter.summary(),
-          self._reward_filter.summary(),
-          tf.summary.scalar('memory_size', self._memory_index),
-          tf.summary.histogram('normalized_observ', norm_observ),
-          tf.summary.histogram('action', self._last_action),
-          tf.summary.scalar('normalized_reward', norm_reward)]), str)
+      summary = tf.cond(
+          self._should_log, lambda: tf.summary.merge([
+              update_filters,
+              self._observ_filter.summary(),
+              self._reward_filter.summary(),
+              tf.summary.scalar('memory_size', self._memory_index),
+              tf.summary.histogram('normalized_observ', norm_observ),
+              tf.summary.histogram('action', self._last_action),
+              tf.summary.scalar('normalized_reward', norm_reward)
+          ]), str)
return summary
def end_episode(self, agent_indices):
@@ -199,20 +200,16 @@ class PPOAlgorithm(object):
Summary tensor.
"""
with tf.name_scope('end_episode/'):
-      return tf.cond(
-          self._is_training,
-          lambda: self._define_end_episode(agent_indices), str)
+      return tf.cond(self._is_training, lambda: self._define_end_episode(agent_indices), str)
def _define_end_episode(self, agent_indices):
"""Implement the branch of end_episode() entered during training."""
episodes, length = self._episodes.data(agent_indices)
space_left = self._config.update_every - self._memory_index
-    use_episodes = tf.range(tf.minimum(
-        tf.shape(agent_indices)[0], space_left))
+    use_episodes = tf.range(tf.minimum(tf.shape(agent_indices)[0], space_left))
episodes = [tf.gather(elem, use_episodes) for elem in episodes]
-    append = self._memory.replace(
-        episodes, tf.gather(length, use_episodes),
-        use_episodes + self._memory_index)
+    append = self._memory.replace(episodes, tf.gather(length, use_episodes),
+                                  use_episodes + self._memory_index)
with tf.control_dependencies([append]):
inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
with tf.control_dependencies([inc_index]):
@@ -229,8 +226,7 @@ class PPOAlgorithm(object):
Summary tensor.
"""
with tf.name_scope('training'):
-      assert_full = tf.assert_equal(
-          self._memory_index, self._config.update_every)
+      assert_full = tf.assert_equal(self._memory_index, self._config.update_every)
with tf.control_dependencies([assert_full]):
data = self._memory.data()
(observ, action, old_mean, old_logstd, reward), length = data
@@ -238,21 +234,17 @@ class PPOAlgorithm(object):
length = tf.identity(length)
observ = self._observ_filter.transform(observ)
reward = self._reward_filter.transform(reward)
-      policy_summary = self._update_policy(
-          observ, action, old_mean, old_logstd, reward, length)
+      policy_summary = self._update_policy(observ, action, old_mean, old_logstd, reward, length)
with tf.control_dependencies([policy_summary]):
value_summary = self._update_value(observ, reward, length)
with tf.control_dependencies([value_summary]):
-          penalty_summary = self._adjust_penalty(
-              observ, old_mean, old_logstd, length)
+          penalty_summary = self._adjust_penalty(observ, old_mean, old_logstd, length)
with tf.control_dependencies([penalty_summary]):
-            clear_memory = tf.group(
-                self._memory.clear(), self._memory_index.assign(0))
+            clear_memory = tf.group(self._memory.clear(), self._memory_index.assign(0))
with tf.control_dependencies([clear_memory]):
-              weight_summary = utility.variable_summaries(
-                  tf.trainable_variables(), self._config.weight_summaries)
-              return tf.summary.merge([
-                  policy_summary, value_summary, penalty_summary, weight_summary])
+              weight_summary = utility.variable_summaries(tf.trainable_variables(),
+                                                          self._config.weight_summaries)
+              return tf.summary.merge([policy_summary, value_summary, penalty_summary, weight_summary])
def _update_value(self, observ, reward, length):
"""Perform multiple update steps of the value baseline.
@@ -269,10 +261,9 @@ class PPOAlgorithm(object):
Summary tensor.
"""
with tf.name_scope('update_value'):
-      loss, summary = tf.scan(
-          lambda _1, _2: self._update_value_step(observ, reward, length),
-          tf.range(self._config.update_epochs_value),
-          [0., ''], parallel_iterations=1)
+      loss, summary = tf.scan(lambda _1, _2: self._update_value_step(observ, reward, length),
+                              tf.range(self._config.update_epochs_value), [0., ''],
+                              parallel_iterations=1)
print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ')
with tf.control_dependencies([loss, print_loss]):
return summary[self._config.update_epochs_value // 2]
@@ -289,15 +280,13 @@ class PPOAlgorithm(object):
Tuple of loss tensor and summary tensor.
"""
loss, summary = self._value_loss(observ, reward, length)
-    gradients, variables = (
-        zip(*self._value_optimizer.compute_gradients(loss)))
-    optimize = self._value_optimizer.apply_gradients(
-        zip(gradients, variables))
+    gradients, variables = (zip(*self._value_optimizer.compute_gradients(loss)))
+    optimize = self._value_optimizer.apply_gradients(zip(gradients, variables))
summary = tf.summary.merge([
summary,
tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
-        utility.gradient_summaries(
-            zip(gradients, variables), dict(value=r'.*'))])
+        utility.gradient_summaries(zip(gradients, variables), dict(value=r'.*'))
+    ])
with tf.control_dependencies([optimize]):
return [tf.identity(loss), tf.identity(summary)]
@@ -317,18 +306,17 @@ class PPOAlgorithm(object):
"""
with tf.name_scope('value_loss'):
value = self._network(observ, length).value
-      return_ = utility.discounted_return(
-          reward, length, self._config.discount)
+      return_ = utility.discounted_return(reward, length, self._config.discount)
advantage = return_ - value
-      value_loss = 0.5 * self._mask(advantage ** 2, length)
+      value_loss = 0.5 * self._mask(advantage**2, length)
summary = tf.summary.merge([
tf.summary.histogram('value_loss', value_loss),
-          tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
+          tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))
+      ])
value_loss = tf.reduce_mean(value_loss)
return tf.check_numerics(value_loss, 'value_loss'), summary
-  def _update_policy(
-      self, observ, action, old_mean, old_logstd, reward, length):
+  def _update_policy(self, observ, action, old_mean, old_logstd, reward, length):
"""Perform multiple update steps of the policy.
The advantage is computed once at the beginning and shared across
@@ -347,35 +335,28 @@ class PPOAlgorithm(object):
Summary tensor.
"""
with tf.name_scope('update_policy'):
-      return_ = utility.discounted_return(
-          reward, length, self._config.discount)
+      return_ = utility.discounted_return(reward, length, self._config.discount)
value = self._network(observ, length).value
if self._config.gae_lambda:
-        advantage = utility.lambda_return(
-            reward, value, length, self._config.discount,
-            self._config.gae_lambda)
+        advantage = utility.lambda_return(reward, value, length, self._config.discount,
+                                          self._config.gae_lambda)
else:
advantage = return_ - value
mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
-      advantage = tf.Print(
-          advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
-          'return and value: ')
      advantage = tf.Print(
-          advantage, [tf.reduce_mean(advantage)],
-          'normalized advantage: ')
+          advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)], 'return and value: ')
+      advantage = tf.Print(advantage, [tf.reduce_mean(advantage)], 'normalized advantage: ')
# pylint: disable=g-long-lambda
-      loss, summary = tf.scan(
-          lambda _1, _2: self._update_policy_step(
-              observ, action, old_mean, old_logstd, advantage, length),
-          tf.range(self._config.update_epochs_policy),
-          [0., ''], parallel_iterations=1)
+      loss, summary = tf.scan(lambda _1, _2: self._update_policy_step(
+          observ, action, old_mean, old_logstd, advantage, length),
+                              tf.range(self._config.update_epochs_policy), [0., ''],
+                              parallel_iterations=1)
print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'policy loss: ')
with tf.control_dependencies([loss, print_loss]):
return summary[self._config.update_epochs_policy // 2]
-  def _update_policy_step(
-      self, observ, action, old_mean, old_logstd, advantage, length):
+  def _update_policy_step(self, observ, action, old_mean, old_logstd, advantage, length):
"""Compute the current policy loss and perform a gradient update step.
Args:
@@ -390,23 +371,19 @@ class PPOAlgorithm(object):
Tuple of loss tensor and summary tensor.
"""
network = self._network(observ, length)
-    loss, summary = self._policy_loss(
-        network.mean, network.logstd, old_mean, old_logstd, action,
-        advantage, length)
-    gradients, variables = (
-        zip(*self._policy_optimizer.compute_gradients(loss)))
-    optimize = self._policy_optimizer.apply_gradients(
-        zip(gradients, variables))
+    loss, summary = self._policy_loss(network.mean, network.logstd, old_mean, old_logstd, action,
+                                      advantage, length)
+    gradients, variables = (zip(*self._policy_optimizer.compute_gradients(loss)))
+    optimize = self._policy_optimizer.apply_gradients(zip(gradients, variables))
summary = tf.summary.merge([
summary,
tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
-        utility.gradient_summaries(
-            zip(gradients, variables), dict(policy=r'.*'))])
+        utility.gradient_summaries(zip(gradients, variables), dict(policy=r'.*'))
+    ])
with tf.control_dependencies([optimize]):
return [tf.identity(loss), tf.identity(summary)]
-  def _policy_loss(
-      self, mean, logstd, old_mean, old_logstd, action, advantage, length):
+  def _policy_loss(self, mean, logstd, old_mean, old_logstd, action, advantage, length):
"""Compute the policy loss composed of multiple components.
1. The policy gradient loss is importance sampled from the data-collecting
@@ -430,24 +407,20 @@ class PPOAlgorithm(object):
"""
with tf.name_scope('policy_loss'):
entropy = utility.diag_normal_entropy(mean, logstd)
-      kl = tf.reduce_mean(self._mask(utility.diag_normal_kl(
-          old_mean, old_logstd, mean, logstd), length), 1)
+      kl = tf.reduce_mean(
+          self._mask(utility.diag_normal_kl(old_mean, old_logstd, mean, logstd), length), 1)
policy_gradient = tf.exp(
utility.diag_normal_logpdf(mean, logstd, action) -
utility.diag_normal_logpdf(old_mean, old_logstd, action))
-      surrogate_loss = -tf.reduce_mean(self._mask(
-          policy_gradient * tf.stop_gradient(advantage), length), 1)
+      surrogate_loss = -tf.reduce_mean(
+          self._mask(policy_gradient * tf.stop_gradient(advantage), length), 1)
kl_penalty = self._penalty * kl
cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
-      cutoff_count = tf.reduce_sum(
-          tf.cast(kl > cutoff_threshold, tf.int32))
-      with tf.control_dependencies([tf.cond(
-          cutoff_count > 0,
-          lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
-        kl_cutoff = (
-            self._config.kl_cutoff_coef *
-            tf.cast(kl > cutoff_threshold, tf.float32) *
-            (kl - cutoff_threshold) ** 2)
+      cutoff_count = tf.reduce_sum(tf.cast(kl > cutoff_threshold, tf.int32))
+      with tf.control_dependencies(
+          [tf.cond(cutoff_count > 0, lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
+        kl_cutoff = (self._config.kl_cutoff_coef * tf.cast(kl > cutoff_threshold, tf.float32) *
+                     (kl - cutoff_threshold)**2)
policy_loss = surrogate_loss + kl_penalty + kl_cutoff
summary = tf.summary.merge([
tf.summary.histogram('entropy', entropy),
@@ -459,7 +432,8 @@ class PPOAlgorithm(object):
tf.summary.histogram('policy_loss', policy_loss),
tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
-          tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
+          tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))
+      ])
policy_loss = tf.reduce_mean(policy_loss, 0)
return tf.check_numerics(policy_loss, 'policy_loss'), summary
@@ -481,30 +455,30 @@ class PPOAlgorithm(object):
"""
with tf.name_scope('adjust_penalty'):
network = self._network(observ, length)
-      assert_change = tf.assert_equal(
-          tf.reduce_all(tf.equal(network.mean, old_mean)), False,
-          message='policy should change')
+      assert_change = tf.assert_equal(tf.reduce_all(tf.equal(network.mean, old_mean)),
+                                      False,
+                                      message='policy should change')
      print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
      with tf.control_dependencies([assert_change, print_penalty]):
-        kl_change = tf.reduce_mean(self._mask(utility.diag_normal_kl(
-            old_mean, old_logstd, network.mean, network.logstd), length))
+        kl_change = tf.reduce_mean(
+            self._mask(utility.diag_normal_kl(old_mean, old_logstd, network.mean, network.logstd),
+                       length))
kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
maybe_increase = tf.cond(
kl_change > 1.3 * self._config.kl_target,
# pylint: disable=g-long-lambda
-            lambda: tf.Print(self._penalty.assign(
-                self._penalty * 1.5), [0], 'increase penalty '),
+            lambda: tf.Print(self._penalty.assign(self._penalty * 1.5), [0], 'increase penalty '),
            float)
        maybe_decrease = tf.cond(
            kl_change < 0.7 * self._config.kl_target,
            # pylint: disable=g-long-lambda
-            lambda: tf.Print(self._penalty.assign(
-                self._penalty / 1.5), [0], 'decrease penalty '),
+            lambda: tf.Print(self._penalty.assign(self._penalty / 1.5), [0], 'decrease penalty '),
float)
with tf.control_dependencies([maybe_increase, maybe_decrease]):
return tf.summary.merge([
tf.summary.scalar('kl_change', kl_change),
-              tf.summary.scalar('penalty', self._penalty)])
+              tf.summary.scalar('penalty', self._penalty)
+          ])
def _mask(self, tensor, length):
"""Set padding elements of a batch of sequences to zero.
@@ -548,11 +522,14 @@ class PPOAlgorithm(object):
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
observ = tf.check_numerics(observ, 'observ')
cell = self._config.network(self._batch_env.action.shape[1].value)
-      (mean, logstd, value), state = tf.nn.dynamic_rnn(
-          cell, observ, length, state, tf.float32, swap_memory=True)
+      (mean, logstd, value), state = tf.nn.dynamic_rnn(cell,
+                                                       observ,
+                                                       length,
+                                                       state,
+                                                       tf.float32,
+                                                       swap_memory=True)
mean = tf.check_numerics(mean, 'mean')
logstd = tf.check_numerics(logstd, 'logstd')
value = tf.check_numerics(value, 'value')
-      policy = tf.contrib.distributions.MultivariateNormalDiag(
-          mean, tf.exp(logstd))
+      policy = tf.contrib.distributions.MultivariateNormalDiag(mean, tf.exp(logstd))
return _NetworkOutput(policy, mean, logstd, value, state)
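The _adjust_penalty hunk above is the adaptive-KL variant of PPO: after each update, the penalty coefficient grows when the measured KL change overshoots the target and shrinks when it undershoots. A minimal plain-Python sketch of that rule, outside the TensorFlow graph, with illustrative names and the constants taken from the diff:

def adjust_penalty(penalty, kl_change, kl_target):
  # Grow the penalty when the policy moved too far, shrink it when it barely moved.
  if kl_change > 1.3 * kl_target:
    penalty *= 1.5
  elif kl_change < 0.7 * kl_target:
    penalty /= 1.5
  return penalty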


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Memory that stores episodes."""
from __future__ import absolute_import
@@ -43,10 +42,9 @@ class EpisodeMemory(object):
self._scope = scope
self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
self._buffers = [
-        tf.Variable(tf.zeros(
-            [capacity, max_length] + elem.shape.as_list(),
-            elem.dtype), False)
-        for elem in template]
+        tf.Variable(tf.zeros([capacity, max_length] + elem.shape.as_list(), elem.dtype), False)
+        for elem in template
+    ]
def length(self, rows=None):
"""Tensor holding the current length of episodes.
@@ -72,13 +70,11 @@ class EpisodeMemory(object):
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
-    assert_capacity = tf.assert_less(
-        rows, self._capacity,
-        message='capacity exceeded')
+    assert_capacity = tf.assert_less(rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([assert_capacity]):
-      assert_max_length = tf.assert_less(
-          tf.gather(self._length, rows), self._max_length,
-          message='max length exceeded')
+      assert_max_length = tf.assert_less(tf.gather(self._length, rows),
+                                         self._max_length,
+                                         message='max length exceeded')
append_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, transitions):
@@ -86,8 +82,7 @@ class EpisodeMemory(object):
indices = tf.stack([rows, timestep], 1)
append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
with tf.control_dependencies(append_ops):
-      episode_mask = tf.reduce_sum(tf.one_hot(
-          rows, self._capacity, dtype=tf.int32), 0)
+      episode_mask = tf.reduce_sum(tf.one_hot(rows, self._capacity, dtype=tf.int32), 0)
return self._length.assign_add(episode_mask)
def replace(self, episodes, length, rows=None):
@@ -103,11 +98,11 @@ class EpisodeMemory(object):
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
-    assert_capacity = tf.assert_less(
-        rows, self._capacity, message='capacity exceeded')
+    assert_capacity = tf.assert_less(rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([assert_capacity]):
-      assert_max_length = tf.assert_less_equal(
-          length, self._max_length, message='max length exceeded')
+      assert_max_length = tf.assert_less_equal(length,
+                                               self._max_length,
+                                               message='max length exceeded')
replace_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, episodes):
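The buffer layout EpisodeMemory maintains is easier to see outside the graph. A NumPy sketch of the same idea, a stand-in for the tf.Variable buffers above rather than the shipped API:

import numpy as np

class EpisodeMemorySketch(object):
  # Fixed-capacity episode buffer: one row per episode, padded to max_length steps.

  def __init__(self, template, capacity, max_length):
    self.length = np.zeros(capacity, np.int32)
    self.buffers = [
        np.zeros((capacity, max_length) + elem.shape, elem.dtype) for elem in template
    ]

  def append(self, rows, transitions):
    # Write one transition per row at the current episode length, then advance it,
    # mirroring the scatter_nd_update and assign_add calls above.
    for buffer_, elements in zip(self.buffers, transitions):
      buffer_[rows, self.length[rows]] = elements
    self.length[rows] += 1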


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize tensors based on streaming estimates of mean and variance."""
from __future__ import absolute_import
@@ -24,8 +23,7 @@ import tensorflow as tf
class StreamingNormalize(object):
"""Normalize tensors based on streaming estimates of mean and variance."""
-  def __init__(
-      self, template, center=True, scale=True, clip=10, name='normalize'):
+  def __init__(self, template, center=True, scale=True, clip=10, name='normalize'):
"""Normalize tensors based on streaming estimates of mean and variance.
Centering the value, scaling it by the standard deviation, and clipping
@@ -69,8 +67,7 @@ class StreamingNormalize(object):
if self._scale:
# We cannot scale before seeing at least two samples.
value /= tf.cond(
-            self._count > 1, lambda: self._std() + 1e-8,
-            lambda: tf.ones_like(self._var_sum))[None]
+            self._count > 1, lambda: self._std() + 1e-8, lambda: tf.ones_like(self._var_sum))[None]
if self._clip:
value = tf.clip_by_value(value, -self._clip, self._clip)
# Remove batch dimension if necessary.
@@ -97,8 +94,7 @@ class StreamingNormalize(object):
mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
new_mean = self._mean + mean_delta / step
new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
-      var_delta = (
-          value - self._mean[None, ...]) * (value - new_mean[None, ...])
+      var_delta = (value - self._mean[None, ...]) * (value - new_mean[None, ...])
new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
with tf.control_dependencies([new_mean, new_var_sum]):
update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
@@ -116,10 +112,8 @@ class StreamingNormalize(object):
Operation.
"""
with tf.name_scope(self._name + '/reset'):
-      return tf.group(
-          self._count.assign(0),
-          self._mean.assign(tf.zeros_like(self._mean)),
-          self._var_sum.assign(tf.zeros_like(self._var_sum)))
+      return tf.group(self._count.assign(0), self._mean.assign(tf.zeros_like(self._mean)),
+                      self._var_sum.assign(tf.zeros_like(self._var_sum)))
def summary(self):
"""Summary string of mean and standard deviation.
@@ -128,10 +122,8 @@ class StreamingNormalize(object):
Summary tensor.
"""
with tf.name_scope(self._name + '/summary'):
-      mean_summary = tf.cond(
-          self._count > 0, lambda: self._summary('mean', self._mean), str)
-      std_summary = tf.cond(
-          self._count > 1, lambda: self._summary('stddev', self._std()), str)
+      mean_summary = tf.cond(self._count > 0, lambda: self._summary('mean', self._mean), str)
+      std_summary = tf.cond(self._count > 1, lambda: self._summary('stddev', self._std()), str)
return tf.summary.merge([mean_summary, std_summary])
def _std(self):
@@ -143,10 +135,8 @@ class StreamingNormalize(object):
Returns:
Tensor of current variance.
"""
-    variance = tf.cond(
-        self._count > 1,
-        lambda: self._var_sum / tf.cast(self._count - 1, tf.float32),
-        lambda: tf.ones_like(self._var_sum) * float('nan'))
+    variance = tf.cond(self._count > 1, lambda: self._var_sum / tf.cast(
+        self._count - 1, tf.float32), lambda: tf.ones_like(self._var_sum) * float('nan'))
# The epsilon corrects for small negative variance values caused by
# the algorithm. It was empirically chosen to work with all environments
# tested.
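The update() hunk above is a batched Welford-style recurrence, and transform() centers, scales by the estimated standard deviation, and clips. A NumPy sketch of the same arithmetic, for illustration only:

import numpy as np

def streaming_update(mean, var_sum, count, batch):
  # Matches update() above: the mean moves by the summed residual, and var_sum
  # grows by the product of residuals against the old and new means.
  count += len(batch)
  new_mean = mean + (batch - mean).sum(0) / count
  var_sum = var_sum + ((batch - mean) * (batch - new_mean)).sum(0)
  return new_mean, var_sum, count

def normalize(value, mean, var_sum, count, clip=10):
  # Matches transform() above: scale only once at least two samples were seen.
  std = np.sqrt(var_sum / (count - 1)) if count > 1 else 1.0
  return np.clip((value - mean) / (std + 1e-8), -clip, clip)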


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for the PPO algorithm."""
from __future__ import absolute_import
@@ -51,8 +50,7 @@ def reinit_nested_vars(variables, indices=None):
Operation.
"""
if isinstance(variables, (tuple, list)):
-    return tf.group(*[
-        reinit_nested_vars(variable, indices) for variable in variables])
+    return tf.group(*[reinit_nested_vars(variable, indices) for variable in variables])
if indices is None:
return variables.assign(tf.zeros_like(variables))
else:
@@ -71,9 +69,8 @@ def assign_nested_vars(variables, tensors):
Operation.
"""
if isinstance(variables, (tuple, list)):
-    return tf.group(*[
-        assign_nested_vars(variable, tensor)
-        for variable, tensor in zip(variables, tensors)])
+    return tf.group(
+        *[assign_nested_vars(variable, tensor) for variable, tensor in zip(variables, tensors)])
return variables.assign(tensors)
@@ -81,10 +78,11 @@ def discounted_return(reward, length, discount):
"""Discounted Monte-Carlo returns."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
-  return_ = tf.reverse(tf.transpose(tf.scan(
-      lambda agg, cur: cur + discount * agg,
-      tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
-      tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
+  return_ = tf.reverse(
+      tf.transpose(
+          tf.scan(lambda agg, cur: cur + discount * agg,
+                  tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
+                  tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
@@ -95,9 +93,8 @@ def fixed_step_return(reward, value, length, discount, window):
return_ = tf.zeros_like(reward)
for _ in range(window):
return_ += reward
-    reward = discount * tf.concat(
-        [reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
-  return_ += discount ** window * tf.concat(
+    reward = discount * tf.concat([reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
+  return_ += discount**window * tf.concat(
      [value[:, window:], tf.zeros_like(value[:, -window:])], 1)
return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
@@ -109,10 +106,11 @@ def lambda_return(reward, value, length, discount, lambda_):
sequence = mask * reward + discount * value * (1 - lambda_)
discount = mask * discount * lambda_
sequence = tf.stack([sequence, discount], 2)
-  return_ = tf.reverse(tf.transpose(tf.scan(
-      lambda agg, cur: cur[0] + cur[1] * agg,
-      tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
-      tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
+  return_ = tf.reverse(
+      tf.transpose(
+          tf.scan(lambda agg, cur: cur[0] + cur[1] * agg,
+                  tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]), tf.zeros_like(value[:, -1]),
+                  1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
@@ -122,27 +120,26 @@ def lambda_advantage(reward, value, length, discount):
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
delta = reward + discount * next_value - value
-  advantage = tf.reverse(tf.transpose(tf.scan(
-      lambda agg, cur: cur + discount * agg,
-      tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
-      tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
+  advantage = tf.reverse(
+      tf.transpose(
+          tf.scan(lambda agg, cur: cur + discount * agg,
+                  tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]), tf.zeros_like(delta[:, -1]),
+                  1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
def diag_normal_kl(mean0, logstd0, mean1, logstd1):
"""Epirical KL divergence of two normals with diagonal covariance."""
logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
-  return 0.5 * (
-      tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
-      tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) +
-      tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
-      mean0.shape[-1].value)
+  return 0.5 * (tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) + tf.reduce_sum(
+      (mean1 - mean0)**2 / tf.exp(logstd1_2), -1) + tf.reduce_sum(logstd1_2, -1) -
+                tf.reduce_sum(logstd0_2, -1) - mean0.shape[-1].value)
def diag_normal_logpdf(mean, logstd, loc):
"""Log density of a normal with diagonal covariance."""
constant = -0.5 * (math.log(2 * math.pi) + logstd)
-  value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2
+  value = -0.5 * ((loc - mean) / tf.exp(logstd))**2
return tf.reduce_sum(constant + value, -1)
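Two helpers in this file are easier to read without the transpose/reverse plumbing that tf.scan requires. A NumPy sketch of discounted_return's right-to-left accumulation and of the closed-form diag_normal_kl; shapes and names are illustrative:

import numpy as np

def discounted_return(reward, discount):
  # reward: [batch, time]; accumulate from the last step backwards, which is
  # what the reversed tf.scan above computes.
  return_ = np.zeros_like(reward)
  agg = np.zeros(reward.shape[0])
  for t in reversed(range(reward.shape[1])):
    agg = reward[:, t] + discount * agg
    return_[:, t] = agg
  return return_

def diag_normal_kl(mean0, logstd0, mean1, logstd1):
  # KL(N0 || N1) for diagonal Gaussians parameterized by mean and log stddev.
  logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
  return 0.5 * (np.exp(logstd0_2 - logstd1_2).sum(-1) +
                ((mean1 - mean0)**2 / np.exp(logstd1_2)).sum(-1) +
                logstd1_2.sum(-1) - logstd0_2.sum(-1) - mean0.shape[-1])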


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Executable scripts for reinforcement learning."""
from __future__ import absolute_import


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example configurations using the PPO algorithm."""
from __future__ import absolute_import
@@ -33,10 +32,7 @@ def default():
use_gpu = False
# Network
network = networks.ForwardGaussianPolicy
-  weight_summaries = dict(
-      all=r'.*',
-      policy=r'.*/policy/.*',
-      value=r'.*/value/.*')
+  weight_summaries = dict(all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
policy_layers = 200, 100
value_layers = 200, 100
init_mean_factor = 0.05


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Networks for the PPO algorithm defined as recurrent cells."""
from __future__ import absolute_import
@@ -20,11 +19,10 @@ from __future__ import print_function
import tensorflow as tf
-_MEAN_WEIGHTS_INITIALIZER = tf.contrib.layers.variance_scaling_initializer(
-    factor=0.1)
+_MEAN_WEIGHTS_INITIALIZER = tf.contrib.layers.variance_scaling_initializer(factor=0.1)
_LOGSTD_INITIALIZER = tf.random_normal_initializer(-1, 1e-10)
class LinearGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Indepent linear network with a tanh at the end for policy and feedforward network for the value.
@@ -56,15 +54,12 @@ class LinearGaussianPolicy(tf.contrib.rnn.RNNCell):
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
-      mean = tf.contrib.layers.fully_connected(
-          x,
-          self._action_size,
-          tf.tanh,
-          weights_initializer=self._mean_weights_initializer)
-      logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32,
-                               self._logstd_initializer)
-      logstd = tf.tile(logstd[None, ...],
-                       [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
+      mean = tf.contrib.layers.fully_connected(x,
+                                               self._action_size,
+                                               tf.tanh,
+                                               weights_initializer=self._mean_weights_initializer)
+      logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
+      logstd = tf.tile(logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
@@ -80,10 +75,12 @@ class ForwardGaussianPolicy(tf.contrib.rnn.RNNCell):
is learned as independent parameter vector.
"""
-  def __init__(
-      self, policy_layers, value_layers, action_size,
-      mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
-      logstd_initializer=_LOGSTD_INITIALIZER):
+  def __init__(self,
+               policy_layers,
+               value_layers,
+               action_size,
+               mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
+               logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
@@ -104,13 +101,12 @@ class ForwardGaussianPolicy(tf.contrib.rnn.RNNCell):
x = tf.contrib.layers.flatten(observation)
for size in self._policy_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
-      mean = tf.contrib.layers.fully_connected(
-          x, self._action_size, tf.tanh,
-          weights_initializer=self._mean_weights_initializer)
-      logstd = tf.get_variable(
-          'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
-      logstd = tf.tile(
-          logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
+      mean = tf.contrib.layers.fully_connected(x,
+                                               self._action_size,
+                                               tf.tanh,
+                                               weights_initializer=self._mean_weights_initializer)
+      logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
+      logstd = tf.tile(logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
@@ -127,10 +123,12 @@ class RecurrentGaussianPolicy(tf.contrib.rnn.RNNCell):
and uses a GRU cell.
"""
-  def __init__(
-      self, policy_layers, value_layers, action_size,
-      mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
-      logstd_initializer=_LOGSTD_INITIALIZER):
+  def __init__(self,
+               policy_layers,
+               value_layers,
+               action_size,
+               mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
+               logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
@@ -152,13 +150,12 @@ class RecurrentGaussianPolicy(tf.contrib.rnn.RNNCell):
for size in self._policy_layers[:-1]:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
x, state = self._cell(x, state)
-      mean = tf.contrib.layers.fully_connected(
-          x, self._action_size, tf.tanh,
-          weights_initializer=self._mean_weights_initializer)
-      logstd = tf.get_variable(
-          'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
-      logstd = tf.tile(
-          logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
+      mean = tf.contrib.layers.fully_connected(x,
+                                               self._action_size,
+                                               tf.tanh,
+                                               weights_initializer=self._mean_weights_initializer)
+      logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
+      logstd = tf.tile(logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
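All three policy cells in this file share the same head: a tanh-squashed linear mean and a state-independent logstd variable tiled over the batch. A NumPy sketch of that head and of sampling from it; bias terms are omitted and the names are illustrative:

import numpy as np

def gaussian_policy_head(features, mean_weights, logstd):
  mean = np.tanh(features @ mean_weights)                    # [batch, action_size]
  logstd = np.tile(logstd[None, :], (features.shape[0], 1))  # shared across the batch
  return mean, logstd

def sample_action(mean, logstd, rng):
  # Reparameterized sample from the diagonal Gaussian the cells return.
  return mean + np.exp(logstd) * rng.standard_normal(mean.shape)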


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to train a batch reinforcement learning algorithm.
Command line:
@@ -68,21 +67,25 @@ def _define_loop(graph, logdir, train_steps, eval_steps):
Returns:
Loop object.
"""
-  loop = tools.Loop(
-      logdir, graph.step, graph.should_log, graph.do_report,
-      graph.force_reset)
-  loop.add_phase(
-      'train', graph.done, graph.score, graph.summary, train_steps,
-      report_every=None,
-      log_every=train_steps // 2,
-      checkpoint_every=None,
-      feed={graph.is_training: True})
-  loop.add_phase(
-      'eval', graph.done, graph.score, graph.summary, eval_steps,
-      report_every=eval_steps,
-      log_every=eval_steps // 2,
-      checkpoint_every=10 * eval_steps,
-      feed={graph.is_training: False})
+  loop = tools.Loop(logdir, graph.step, graph.should_log, graph.do_report, graph.force_reset)
+  loop.add_phase('train',
+                 graph.done,
+                 graph.score,
+                 graph.summary,
+                 train_steps,
+                 report_every=None,
+                 log_every=train_steps // 2,
+                 checkpoint_every=None,
+                 feed={graph.is_training: True})
+  loop.add_phase('eval',
+                 graph.done,
+                 graph.score,
+                 graph.summary,
+                 eval_steps,
+                 report_every=eval_steps,
+                 log_every=eval_steps // 2,
+                 checkpoint_every=10 * eval_steps,
+                 feed={graph.is_training: False})
return loop
@@ -101,25 +104,19 @@ def train(config, env_processes):
"""
tf.reset_default_graph()
with config.unlocked:
-    config.network = functools.partial(
-        utility.define_network, config.network, config)
+    config.network = functools.partial(utility.define_network, config.network, config)
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
config.value_optimizer = getattr(tf.train, config.value_optimizer)
if config.update_every % config.num_agents:
tf.logging.warn('Number of agents should divide episodes per update.')
with tf.device('/cpu:0'):
-    batch_env = utility.define_batch_env(
-        lambda: _create_environment(config),
-        config.num_agents, env_processes)
-    graph = utility.define_simulation_graph(
-        batch_env, config.algorithm, config)
-    loop = _define_loop(
-        graph, config.logdir,
-        config.update_every * config.max_length,
-        config.eval_episodes * config.max_length)
-    total_steps = int(
-        config.steps / config.update_every *
-        (config.update_every + config.eval_episodes))
+    batch_env = utility.define_batch_env(lambda: _create_environment(config), config.num_agents,
+                                         env_processes)
+    graph = utility.define_simulation_graph(batch_env, config.algorithm, config)
+    loop = _define_loop(graph, config.logdir, config.update_every * config.max_length,
+                        config.eval_episodes * config.max_length)
+    total_steps = int(config.steps / config.update_every *
+                      (config.update_every + config.eval_episodes))
# Exclude episode related variables since the Python state of environments is
# not checkpointed and thus new episodes start after resuming.
saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
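The total_steps expression above scales config.steps so the loop also covers the interleaved evaluation phases: each cycle of update_every training episodes is followed by eval_episodes evaluation episodes. A worked example with made-up numbers, assuming Python 3 true division:

steps, update_every, eval_episodes = 10000, 30, 25
total_steps = int(steps / update_every * (update_every + eval_episodes))
print(total_steps)  # 18333: 10000 / 30 * 55, rounded down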
@@ -137,8 +134,8 @@ def main(_):
utility.set_up_logging()
if not FLAGS.config:
raise KeyError('You must specify a configuration.')
-  logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
-      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
+  logdir = FLAGS.logdir and os.path.expanduser(
+      os.path.join(FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
try:
config = utility.load_config(logdir)
except IOError:
@@ -150,16 +147,11 @@ def main(_):
if __name__ == '__main__':
FLAGS = tf.app.flags.FLAGS
-  tf.app.flags.DEFINE_string(
-      'logdir', None,
-      'Base directory to store logs.')
-  tf.app.flags.DEFINE_string(
-      'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
-      'Sub directory to store logs.')
-  tf.app.flags.DEFINE_string(
-      'config', None,
-      'Configuration to execute.')
-  tf.app.flags.DEFINE_boolean(
-      'env_processes', True,
-      'Step environments in separate processes to circumvent the GIL.')
+  tf.app.flags.DEFINE_string('logdir', None, 'Base directory to store logs.')
+  tf.app.flags.DEFINE_string('timestamp',
+                             datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
+                             'Sub directory to store logs.')
+  tf.app.flags.DEFINE_string('config', None, 'Configuration to execute.')
+  tf.app.flags.DEFINE_boolean('env_processes', True,
+                              'Step environments in separate processes to circumvent the GIL.')
tf.app.run()


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the PPO algorithm usage example."""
from __future__ import absolute_import
@@ -29,7 +28,6 @@ from google3.robotics.reinforcement_learning.agents.scripts import configs
from google3.robotics.reinforcement_learning.agents.scripts import networks
from google3.robotics.reinforcement_learning.agents.scripts import train
FLAGS = tf.app.flags.FLAGS
@@ -65,9 +63,11 @@ class PPOTest(tf.test.TestCase):
for network, observ_shape in itertools.product(nets, observ_shapes):
config = self._define_config()
with config.unlocked:
-        config.env = functools.partial(
-            tools.MockEnvironment, observ_shape, action_shape=(3,),
-            min_duration=15, max_duration=15)
+        config.env = functools.partial(tools.MockEnvironment,
+                                       observ_shape,
+                                       action_shape=(3,),
+                                       min_duration=15,
+                                       max_duration=15)
config.max_length = 20
config.steps = 100
config.network = network
@@ -77,9 +77,11 @@ class PPOTest(tf.test.TestCase):
def test_no_crash_variable_duration(self):
config = self._define_config()
with config.unlocked:
-      config.env = functools.partial(
-          tools.MockEnvironment, observ_shape=(2, 3), action_shape=(3,),
-          min_duration=5, max_duration=25)
+      config.env = functools.partial(tools.MockEnvironment,
+                                     observ_shape=(2, 3),
+                                     action_shape=(3,),
+                                     min_duration=5,
+                                     max_duration=25)
config.max_length = 25
config.steps = 200
config.network = networks.RecurrentGaussianPolicy


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for using reinforcement learning algorithms."""
from __future__ import absolute_import
@@ -46,8 +45,7 @@ def define_simulation_graph(batch_env, algo_cls, config):
do_report = tf.placeholder(tf.bool, name='do_report')
force_reset = tf.placeholder(tf.bool, name='force_reset')
algo = algo_cls(batch_env, step, is_training, should_log, config)
-  done, score, summary = tools.simulate(
-      batch_env, algo, should_log, force_reset)
+  done, score, summary = tools.simulate(batch_env, algo, should_log, force_reset)
message = 'Graph contains {} trainable variables.'
tf.logging.info(message.format(tools.count_weights()))
# pylint: enable=unused-variable
@@ -67,9 +65,7 @@ def define_batch_env(constructor, num_agents, env_processes):
"""
with tf.variable_scope('environments'):
if env_processes:
-      envs = [
-          tools.wrappers.ExternalProcess(constructor)
-          for _ in range(num_agents)]
+      envs = [tools.wrappers.ExternalProcess(constructor) for _ in range(num_agents)]
else:
envs = [constructor() for _ in range(num_agents)]
batch_env = tools.BatchEnv(envs, blocking=not env_processes)
@@ -108,15 +104,14 @@ def define_network(constructor, config, action_size):
Returns:
Created recurrent cell object.
"""
-  mean_weights_initializer = (
-      tf.contrib.layers.variance_scaling_initializer(
-          factor=config.init_mean_factor))
-  logstd_initializer = tf.random_normal_initializer(
-      config.init_logstd, 1e-10)
-  network = constructor(
-      config.policy_layers, config.value_layers, action_size,
-      mean_weights_initializer=mean_weights_initializer,
-      logstd_initializer=logstd_initializer)
+  mean_weights_initializer = (tf.contrib.layers.variance_scaling_initializer(
+      factor=config.init_mean_factor))
+  logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10)
+  network = constructor(config.policy_layers,
+                        config.value_layers,
+                        action_size,
+                        mean_weights_initializer=mean_weights_initializer,
+                        logstd_initializer=logstd_initializer)
return network
@@ -134,9 +129,7 @@ def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
ValueError: If resume expected but no log directory specified.
RuntimeError: If no resume expected but a checkpoint was found.
"""
-  sess.run(tf.group(
-      tf.local_variables_initializer(),
-      tf.global_variables_initializer()))
+  sess.run(tf.group(tf.local_variables_initializer(), tf.global_variables_initializer()))
if resume and not (logdir or checkpoint):
raise ValueError('Need to specify logdir to resume a checkpoint.')
if logdir:
@@ -175,9 +168,8 @@ def save_config(config, logdir=None):
with tf.gfile.GFile(config_path, 'w') as file_:
yaml.dump(config, file_, default_flow_style=False)
else:
-    message = (
-        'Start a new run without storing summaries and checkpoints since no '
-        'logging directory was specified.')
+    message = ('Start a new run without storing summaries and checkpoints since no '
+               'logging directory was specified.')
tf.logging.info(message)
return config
@@ -196,9 +188,8 @@ def load_config(logdir):
"""
config_path = logdir and os.path.join(logdir, 'config.yaml')
if not config_path or not tf.gfile.Exists(config_path):
-    message = (
-        'Cannot resume an existing run since the logging directory does not '
-        'contain a configuration file.')
+    message = ('Cannot resume an existing run since the logging directory does not '
+               'contain a configuration file.')
raise IOError(message)
with tf.gfile.FastGFile(config_path, 'r') as file_:
config = yaml.load(file_)


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to render videos of the Proximal Policy Gradient algorithm.
Command line:
@@ -54,6 +53,8 @@ def _create_environment(config, outdir):
setattr(env, 'spec', getattr(env, 'spec', None))
if config.max_length:
env = tools.wrappers.LimitDuration(env, config.max_length)
+  # env = gym.wrappers.Monitor(
+  #     env, outdir, lambda unused_episode_number: True)
env = tools.wrappers.RangeNormalize(env)
@@ -72,20 +73,20 @@ def _define_loop(graph, eval_steps):
Returns:
Loop object.
"""
-  loop = tools.Loop(
-      None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
-  loop.add_phase(
-      'eval', graph.done, graph.score, graph.summary, eval_steps,
-      report_every=eval_steps,
-      log_every=None,
-      checkpoint_every=None,
-      feed={graph.is_training: False})
+  loop = tools.Loop(None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
+  loop.add_phase('eval',
+                 graph.done,
+                 graph.score,
+                 graph.summary,
+                 eval_steps,
+                 report_every=eval_steps,
+                 log_every=None,
+                 checkpoint_every=None,
+                 feed={graph.is_training: False})
return loop
-def visualize(
-    logdir, outdir, num_agents, num_episodes, checkpoint=None,
-    env_processes=True):
+def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None, env_processes=True):
"""Recover checkpoint and render videos from it.
Args:
@@ -98,25 +99,20 @@ def visualize(
"""
config = utility.load_config(logdir)
with config.unlocked:
-    config.network = functools.partial(
-        utility.define_network, config.network, config)
+    config.network = functools.partial(utility.define_network, config.network, config)
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
config.value_optimizer = getattr(tf.train, config.value_optimizer)
with tf.device('/cpu:0'):
-    batch_env = utility.define_batch_env(
-        lambda: _create_environment(config, outdir),
-        num_agents, env_processes)
-    graph = utility.define_simulation_graph(
-        batch_env, config.algorithm, config)
+    batch_env = utility.define_batch_env(lambda: _create_environment(config, outdir), num_agents,
+                                         env_processes)
+    graph = utility.define_simulation_graph(batch_env, config.algorithm, config)
total_steps = num_episodes * config.max_length
loop = _define_loop(graph, total_steps)
-    saver = utility.define_saver(
-        exclude=(r'.*_temporary/.*', r'global_step'))
+    saver = utility.define_saver(exclude=(r'.*_temporary/.*', r'global_step'))
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
with tf.Session(config=sess_config) as sess:
-      utility.initialize_variables(
-          sess, saver, config.logdir, checkpoint, resume=True)
+      utility.initialize_variables(sess, saver, config.logdir, checkpoint, resume=True)
for unused_score in loop.run(sess, saver, total_steps):
pass
batch_env.close()
@@ -129,29 +125,18 @@ def main(_):
raise KeyError('You must specify logging and outdirs directories.')
FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
-  visualize(
-      FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
-      FLAGS.checkpoint, FLAGS.env_processes)
+  visualize(FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes, FLAGS.checkpoint,
+            FLAGS.env_processes)
if __name__ == '__main__':
FLAGS = tf.app.flags.FLAGS
-  tf.app.flags.DEFINE_string(
-      'logdir', None,
-      'Directory to the checkpoint of a training run.')
-  tf.app.flags.DEFINE_string(
-      'outdir', None,
-      'Local directory for storing the monitoring outdir.')
-  tf.app.flags.DEFINE_string(
-      'checkpoint', None,
-      'Checkpoint name to load; defaults to most recent.')
-  tf.app.flags.DEFINE_integer(
-      'num_agents', 1,
-      'How many environments to step in parallel.')
-  tf.app.flags.DEFINE_integer(
-      'num_episodes', 5,
-      'Minimum number of episodes to render.')
-  tf.app.flags.DEFINE_boolean(
-      'env_processes', True,
-      'Step environments in separate processes to circumvent the GIL.')
+  tf.app.flags.DEFINE_string('logdir', None, 'Directory to the checkpoint of a training run.')
+  tf.app.flags.DEFINE_string('outdir', None, 'Local directory for storing the monitoring outdir.')
+  tf.app.flags.DEFINE_string('checkpoint', None,
+                             'Checkpoint name to load; defaults to most recent.')
+  tf.app.flags.DEFINE_integer('num_agents', 1, 'How many environments to step in parallel.')
+  tf.app.flags.DEFINE_integer('num_episodes', 5, 'Minimum number of episodes to render.')
+  tf.app.flags.DEFINE_boolean('env_processes', True,
+                              'Step environments in separate processes to circumvent the GIL.')
tf.app.run()


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tools for reinforcement learning."""
from __future__ import absolute_import


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrap a dictionary to access keys as attributes."""
from __future__ import absolute_import


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attribute dictionary."""
from __future__ import absolute_import


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Combine multiple environments to step them in batch."""
from __future__ import absolute_import
@@ -84,13 +83,9 @@ class BatchEnv(object):
message = 'Invalid action at index {}: {}'
raise ValueError(message.format(index, action))
if self._blocking:
-      transitions = [
-          env.step(action)
-          for env, action in zip(self._envs, actions)]
+      transitions = [env.step(action) for env, action in zip(self._envs, actions)]
    else:
-      transitions = [
-          env.step(action, blocking=False)
-          for env, action in zip(self._envs, actions)]
+      transitions = [env.step(action, blocking=False) for env, action in zip(self._envs, actions)]
transitions = [transition() for transition in transitions]
observs, rewards, dones, infos = zip(*transitions)
observ = np.stack(observs)


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Count learnable parameters."""
from __future__ import absolute_import


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the weight counting utility."""
from __future__ import absolute_import


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Batch of environments inside the TensorFlow graph."""
from __future__ import absolute_import
@@ -42,18 +41,18 @@ class InGraphBatchEnv(object):
action_shape = self._parse_shape(self._batch_env.action_space)
action_dtype = self._parse_dtype(self._batch_env.action_space)
with tf.variable_scope('env_temporary'):
-      self._observ = tf.Variable(
-          tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
-          name='observ', trainable=False)
-      self._action = tf.Variable(
-          tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
-          name='action', trainable=False)
-      self._reward = tf.Variable(
-          tf.zeros((len(self._batch_env),), tf.float32),
-          name='reward', trainable=False)
-      self._done = tf.Variable(
-          tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
-          name='done', trainable=False)
+      self._observ = tf.Variable(tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
+                                 name='observ',
+                                 trainable=False)
+      self._action = tf.Variable(tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
+                                 name='action',
+                                 trainable=False)
+      self._reward = tf.Variable(tf.zeros((len(self._batch_env),), tf.float32),
+                                 name='reward',
+                                 trainable=False)
+      self._done = tf.Variable(tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
+                               name='done',
+                               trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to one of the original environments.
@@ -89,16 +88,13 @@ class InGraphBatchEnv(object):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
-      observ, reward, done = tf.py_func(
-          lambda a: self._batch_env.step(a)[:3], [action],
-          [observ_dtype, tf.float32, tf.bool], name='step')
+      observ, reward, done = tf.py_func(lambda a: self._batch_env.step(a)[:3], [action],
+                                        [observ_dtype, tf.float32, tf.bool],
+                                        name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
-      return tf.group(
-          self._observ.assign(observ),
-          self._action.assign(action),
-          self._reward.assign(reward),
-          self._done.assign(done))
+      return tf.group(self._observ.assign(observ), self._action.assign(action),
+                      self._reward.assign(reward), self._done.assign(done))
def reset(self, indices=None):
"""Reset the batch of environments.
@@ -112,15 +108,15 @@ class InGraphBatchEnv(object):
if indices is None:
indices = tf.range(len(self._batch_env))
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
-      observ = tf.py_func(
-          self._batch_env.reset, [indices], observ_dtype, name='reset')
+      observ = tf.py_func(self._batch_env.reset, [indices], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
reward = tf.zeros_like(indices, tf.float32)
done = tf.zeros_like(indices, tf.bool)
with tf.control_dependencies([
tf.scatter_update(self._observ, indices, observ),
tf.scatter_update(self._reward, indices, reward),
-          tf.scatter_update(self._done, indices, done)]):
+          tf.scatter_update(self._done, indices, done)
+      ]):
return tf.identity(observ)
@property


@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Put an OpenAI Gym environment into the TensorFlow graph."""
from __future__ import absolute_import
@@ -42,16 +41,15 @@ class InGraphEnv(object):
action_shape = self._parse_shape(self._env.action_space)
action_dtype = self._parse_dtype(self._env.action_space)
with tf.name_scope('environment'):
-      self._observ = tf.Variable(
-          tf.zeros(observ_shape, observ_dtype), name='observ', trainable=False)
-      self._action = tf.Variable(
-          tf.zeros(action_shape, action_dtype), name='action', trainable=False)
-      self._reward = tf.Variable(
-          0.0, dtype=tf.float32, name='reward', trainable=False)
-      self._done = tf.Variable(
-          True, dtype=tf.bool, name='done', trainable=False)
-      self._step = tf.Variable(
-          0, dtype=tf.int32, name='step', trainable=False)
+      self._observ = tf.Variable(tf.zeros(observ_shape, observ_dtype),
+                                 name='observ',
+                                 trainable=False)
+      self._action = tf.Variable(tf.zeros(action_shape, action_dtype),
+                                 name='action',
+                                 trainable=False)
+      self._reward = tf.Variable(0.0, dtype=tf.float32, name='reward', trainable=False)
+      self._done = tf.Variable(True, dtype=tf.bool, name='done', trainable=False)
+      self._step = tf.Variable(0, dtype=tf.int32, name='step', trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to the original environment.
@@ -79,17 +77,14 @@ class InGraphEnv(object):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._env.observation_space)
observ, reward, done = tf.py_func(
lambda a: self._env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool], name='step')
observ, reward, done = tf.py_func(lambda a: self._env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool],
name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
return tf.group(
self._observ.assign(observ),
self._action.assign(action),
self._reward.assign(reward),
self._done.assign(done),
self._step.assign_add(1))
return tf.group(self._observ.assign(observ), self._action.assign(action),
self._reward.assign(reward), self._done.assign(done),
self._step.assign_add(1))
def reset(self):
"""Reset the environment.
@@ -100,10 +95,10 @@ class InGraphEnv(object):
observ_dtype = self._parse_dtype(self._env.observation_space)
observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
with tf.control_dependencies([
self._observ.assign(observ),
self._reward.assign(0),
self._done.assign(False)]):
with tf.control_dependencies(
[self._observ.assign(observ),
self._reward.assign(0),
self._done.assign(False)]):
return tf.identity(observ)
@property
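
The reset hunk above relies on a common TF1 idiom: group the state assignments as control dependencies so they are guaranteed to execute before the returned observation is consumed. A stripped-down sketch (variable names illustrative):

import tensorflow as tf

observ_var = tf.Variable(tf.zeros((4,)), trainable=False, name='observ')
reward_var = tf.Variable(0.0, trainable=False, name='reward')
done_var = tf.Variable(True, trainable=False, name='done')
new_observ = tf.ones((4,))
with tf.control_dependencies([
    observ_var.assign(new_observ),
    reward_var.assign(0.0),
    done_var.assign(False)
]):
  # tf.identity creates a tensor whose evaluation forces the assignments.
  reset_observ = tf.identity(new_observ)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(reset_observ))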

View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Execute operations in a loop and coordinate logging and checkpoints."""
from __future__ import absolute_import
@@ -25,10 +24,8 @@ import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import streaming_mean
_Phase = collections.namedtuple(
'Phase',
'name, writer, op, batch, steps, feed, report_every, log_every,'
'Phase', 'name, writer, op, batch, steps, feed, report_every, log_every,'
'checkpoint_every')
@@ -56,16 +53,22 @@ class Loop(object):
reset: Tensor indicating to the model to start a new computation.
"""
self._logdir = logdir
self._step = (
tf.Variable(0, False, name='global_step') if step is None else step)
self._step = (tf.Variable(0, False, name='global_step') if step is None else step)
self._log = tf.placeholder(tf.bool) if log is None else log
self._report = tf.placeholder(tf.bool) if report is None else report
self._reset = tf.placeholder(tf.bool) if reset is None else reset
self._phases = []
def add_phase(
self, name, done, score, summary, steps,
report_every=None, log_every=None, checkpoint_every=None, feed=None):
def add_phase(self,
name,
done,
score,
summary,
steps,
report_every=None,
log_every=None,
checkpoint_every=None,
feed=None):
"""Add a phase to the loop protocol.
If the model breaks long computation into multiple steps, the done tensor
@@ -97,13 +100,12 @@ class Loop(object):
if done.shape.ndims is None or score.shape.ndims is None:
raise ValueError("Rank of 'done' and 'score' tensors must be known.")
writer = self._logdir and tf.summary.FileWriter(
os.path.join(self._logdir, name), tf.get_default_graph(),
flush_secs=60)
os.path.join(self._logdir, name), tf.get_default_graph(), flush_secs=60)
op = self._define_step(done, score, summary)
batch = 1 if score.shape.ndims == 0 else score.shape[0].value
self._phases.append(_Phase(
name, writer, op, batch, int(steps), feed, report_every,
log_every, checkpoint_every))
self._phases.append(
_Phase(name, writer, op, batch, int(steps), feed, report_every, log_every,
checkpoint_every))
def run(self, sess, saver, max_step=None):
"""Run the loop schedule for a specified number of steps.
@@ -133,13 +135,11 @@ class Loop(object):
tf.logging.info(message.format(phase.name, phase_step, global_step))
# Populate bookkeeping tensors.
phase.feed[self._reset] = (steps_in < steps_made)
phase.feed[self._log] = (
phase.writer and
self._is_every_steps(phase_step, phase.batch, phase.log_every))
phase.feed[self._report] = (
self._is_every_steps(phase_step, phase.batch, phase.report_every))
summary, mean_score, global_step, steps_made = sess.run(
phase.op, phase.feed)
phase.feed[self._log] = (phase.writer and
self._is_every_steps(phase_step, phase.batch, phase.log_every))
phase.feed[self._report] = (self._is_every_steps(phase_step, phase.batch,
phase.report_every))
summary, mean_score, global_step, steps_made = sess.run(phase.op, phase.feed)
if self._is_every_steps(phase_step, phase.batch, phase.checkpoint_every):
self._store_checkpoint(sess, saver, global_step)
if self._is_every_steps(phase_step, phase.batch, phase.report_every):
@@ -207,8 +207,7 @@ class Loop(object):
score_mean = streaming_mean.StreamingMean((), tf.float32)
with tf.control_dependencies([done, score, summary]):
done_score = tf.gather(score, tf.where(done)[:, 0])
submit_score = tf.cond(
tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
submit_score = tf.cond(tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
with tf.control_dependencies([submit_score]):
mean_score = tf.cond(self._report, score_mean.clear, float)
steps_made = tf.shape(score)[0]
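
The run() hunk above keys logging, reporting, and checkpointing off a shared periodicity test, _is_every_steps, whose body is not shown in this diff. The following plain-Python sketch is one plausible reconstruction (an assumption, not the method's verbatim code): a trigger fires if any step in the batch just processed crosses a multiple of `every`.

def is_every_steps(phase_step, batch, every):
  """Sketch: True if a multiple of `every` falls in the last batch of steps."""
  if not every:
    return False
  covered_steps = range(phase_step, phase_step + batch)
  return any((step + 1) % every == 0 for step in covered_steps)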

View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the training loop."""
from __future__ import absolute_import
@@ -28,8 +27,7 @@ class LoopTest(tf.test.TestCase):
def test_report_every_step(self):
step = tf.Variable(0, False, dtype=tf.int32, name='step')
loop = tools.Loop(None, step)
loop.add_phase(
'phase_1', done=True, score=0, summary='', steps=1, report_every=3)
loop.add_phase('phase_1', done=True, score=0, summary='', steps=1, report_every=3)
# Step: 0 1 2 3 4 5 6 7 8
# Report: x x x
with self.test_session() as sess:
@@ -45,15 +43,33 @@ class LoopTest(tf.test.TestCase):
def test_phases_feed(self):
score = tf.placeholder(tf.float32, [])
loop = tools.Loop(None)
loop.add_phase(
'phase_1', done=True, score=score, summary='', steps=1, report_every=1,
log_every=None, checkpoint_every=None, feed={score: 1})
loop.add_phase(
'phase_2', done=True, score=score, summary='', steps=3, report_every=1,
log_every=None, checkpoint_every=None, feed={score: 2})
loop.add_phase(
'phase_3', done=True, score=score, summary='', steps=2, report_every=1,
log_every=None, checkpoint_every=None, feed={score: 3})
loop.add_phase('phase_1',
done=True,
score=score,
summary='',
steps=1,
report_every=1,
log_every=None,
checkpoint_every=None,
feed={score: 1})
loop.add_phase('phase_2',
done=True,
score=score,
summary='',
steps=3,
report_every=1,
log_every=None,
checkpoint_every=None,
feed={score: 2})
loop.add_phase('phase_3',
done=True,
score=score,
summary='',
steps=2,
report_every=1,
log_every=None,
checkpoint_every=None,
feed={score: 3})
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = list(loop.run(sess, saver=None, max_step=15))
@@ -61,10 +77,8 @@ class LoopTest(tf.test.TestCase):
def test_average_score_over_phases(self):
loop = tools.Loop(None)
loop.add_phase(
'phase_1', done=True, score=1, summary='', steps=1, report_every=2)
loop.add_phase(
'phase_2', done=True, score=2, summary='', steps=2, report_every=5)
loop.add_phase('phase_1', done=True, score=1, summary='', steps=1, report_every=2)
loop.add_phase('phase_2', done=True, score=2, summary='', steps=2, report_every=5)
# Score: 1 2 2 1 2 2 1 2 2 1 2 2 1 2 2 1 2
# Report 1: x x x
# Report 2: x x
@@ -78,8 +92,7 @@ class LoopTest(tf.test.TestCase):
done = tf.equal((step + 1) % 2, 0)
score = tf.cast(step, tf.float32)
loop = tools.Loop(None, step)
loop.add_phase(
'phase_1', done, score, summary='', steps=1, report_every=3)
loop.add_phase('phase_1', done, score, summary='', steps=1, report_every=3)
# Score: 0 1 2 3 4 5 6 7 8
# Done: x x x x
# Report: x x x
@@ -91,10 +104,9 @@ class LoopTest(tf.test.TestCase):
def test_not_done_batch(self):
step = tf.Variable(0, False, dtype=tf.int32, name='step')
done = tf.equal([step % 3, step % 4], 0)
score = tf.cast([step, step ** 2], tf.float32)
score = tf.cast([step, step**2], tf.float32)
loop = tools.Loop(None, step)
loop.add_phase(
'phase_1', done, score, summary='', steps=1, report_every=8)
loop.add_phase('phase_1', done, score, summary='', steps=1, report_every=8)
# Step: 0 2 4 6
# Score 1: 0 2 4 6
# Done 1: x x
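
The schedules in these test comments all reduce to the masking trick used by _define_step above: gather only the scores whose episodes finished. In isolation (values illustrative):

import tensorflow as tf

done = tf.constant([True, False, True])
score = tf.constant([1.0, 2.0, 3.0])
# tf.where(done) yields the indices of finished episodes; gather keeps them.
done_score = tf.gather(score, tf.where(done)[:, 0])

with tf.Session() as sess:
  print(sess.run(done_score))  # [1.0, 3.0]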

View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock algorithm for testing reinforcement learning code."""
from __future__ import absolute_import
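
This diff only touches the mock's license header, but the simulate() hunks below show the interface any algorithm, mock or real, must satisfy: begin_episode, experience, and end_episode, each returning a summary string tensor. A sketch of that contract (inferred from the call sites, not from the mock's actual body):

import tensorflow as tf

class MockAlgorithmSketch(object):
  """Hypothetical minimal algorithm satisfying simulate()'s call sites."""

  def begin_episode(self, agent_indices):
    return tf.constant('')

  def experience(self, observ, action, reward, done, nextob):
    return tf.constant('')

  def end_episode(self, agent_indices):
    return tf.constant('')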

View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock environment for testing reinforcement learning code."""
from __future__ import absolute_import
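
As with the mock algorithm, only the header changes here; the constructor signature is visible in the tests later in this diff. Typical usage, assuming the standard Gym step signature implied by the [:3] slice in the in-graph wrappers above (import path as used in wrappers_test below):

from agents import tools

# Episodes last exactly two steps; observations are 2x3, actions length 3.
env = tools.MockEnvironment(observ_shape=(2, 3),
                            action_shape=(3,),
                            min_duration=2,
                            max_duration=2)
observ = env.reset()
observ, reward, done, info = env.step(env.action_space.sample())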

View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""In-graph simulation step of a vecrotized algorithm with environments."""
from __future__ import absolute_import
@@ -55,7 +54,8 @@ def simulate(batch_env, algo, log=True, reset=False):
reset_ops = [
batch_env.reset(agent_indices),
tf.scatter_update(score, agent_indices, zero_scores),
tf.scatter_update(length, agent_indices, zero_durations)]
tf.scatter_update(length, agent_indices, zero_durations)
]
with tf.control_dependencies(reset_ops):
return algo.begin_episode(agent_indices)
@@ -76,9 +76,8 @@ def simulate(batch_env, algo, log=True, reset=False):
add_score = score.assign_add(batch_env.reward)
inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
with tf.control_dependencies([add_score, inc_length]):
experience_summary = algo.experience(
prevob, batch_env.action, batch_env.reward, batch_env.done,
batch_env.observ)
experience_summary = algo.experience(prevob, batch_env.action, batch_env.reward,
batch_env.done, batch_env.observ)
return tf.summary.merge([step_summary, experience_summary])
def _define_end_episode(agent_indices):
@@ -94,8 +93,7 @@ def simulate(batch_env, algo, log=True, reset=False):
"""
assert agent_indices.shape.ndims == 1
submit_score = mean_score.submit(tf.gather(score, agent_indices))
submit_length = mean_length.submit(
tf.cast(tf.gather(length, agent_indices), tf.float32))
submit_length = mean_length.submit(tf.cast(tf.gather(length, agent_indices), tf.float32))
with tf.control_dependencies([submit_score, submit_length]):
return algo.end_episode(agent_indices)
@@ -105,41 +103,34 @@ def simulate(batch_env, algo, log=True, reset=False):
Returns:
Summary string.
"""
score_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
length_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
score_summary = tf.cond(tf.logical_and(log, tf.cast(
mean_score.count, tf.bool)), lambda: tf.summary.scalar('mean_score', mean_score.clear()),
str)
length_summary = tf.cond(tf.logical_and(
log, tf.cast(mean_length.count,
tf.bool)), lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
return tf.summary.merge([score_summary, length_summary])
with tf.name_scope('simulate'):
log = tf.convert_to_tensor(log)
reset = tf.convert_to_tensor(reset)
with tf.variable_scope('simulate_temporary'):
score = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
length = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
score = tf.Variable(tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
length = tf.Variable(tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
mean_score = streaming_mean.StreamingMean((), tf.float32)
mean_length = streaming_mean.StreamingMean((), tf.float32)
agent_indices = tf.cond(
reset,
lambda: tf.range(len(batch_env)),
lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
begin_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_begin_episode(agent_indices), str)
agent_indices = tf.cond(reset, lambda: tf.range(len(batch_env)), lambda: tf.cast(
tf.where(batch_env.done)[:, 0], tf.int32))
begin_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0],
tf.bool), lambda: _define_begin_episode(agent_indices), str)
with tf.control_dependencies([begin_episode]):
step = _define_step()
with tf.control_dependencies([step]):
agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
end_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_end_episode(agent_indices), str)
end_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0],
tf.bool), lambda: _define_end_episode(agent_indices), str)
with tf.control_dependencies([end_episode]):
summary = tf.summary.merge([
_define_summaries(), begin_episode, step, end_episode])
summary = tf.summary.merge([_define_summaries(), begin_episode, step, end_episode])
with tf.control_dependencies([summary]):
done, score = tf.identity(batch_env.done), tf.identity(score)
return done, score, summary
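
A recurring trick in this function is passing the builtin str as the false branch of tf.cond: str() returns the empty string, which TensorFlow converts to an empty scalar string tensor, keeping the two branches type-compatible while skipping the summary. In isolation:

import tensorflow as tf

done = tf.constant([False, True, False])
agent_indices = tf.cast(tf.where(done)[:, 0], tf.int32)
# Runs the true branch only when at least one agent finished an episode.
maybe_summary = tf.cond(
    tf.cast(tf.shape(agent_indices)[0], tf.bool),
    lambda: tf.constant('some-summary'), str)

with tf.Session() as sess:
  print(sess.run(maybe_summary))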

View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the simulation operation."""
from __future__ import absolute_import
@@ -84,9 +83,10 @@ class SimulateTest(tf.test.TestCase):
def _create_test_batch_env(self, durations):
envs = []
for duration in durations:
env = tools.MockEnvironment(
observ_shape=(2, 3), action_shape=(3,),
min_duration=duration, max_duration=duration)
env = tools.MockEnvironment(observ_shape=(2, 3),
action_shape=(3,),
min_duration=duration,
max_duration=duration)
env = tools.wrappers.ConvertTo32Bit(env)
envs.append(env)
batch_env = tools.BatchEnv(envs, blocking=True)
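
For reference, the fixture above composes three layers: mock environments, a 32-bit conversion wrapper, and a blocking batch. A condensed sketch of the same construction (durations illustrative):

from agents import tools

durations = (3, 5)
envs = []
for duration in durations:
  env = tools.MockEnvironment(observ_shape=(2, 3),
                              action_shape=(3,),
                              min_duration=duration,
                              max_duration=duration)
  # Cast observations, actions, and rewards to 32-bit types so they match
  # the in-graph variables.
  envs.append(tools.wrappers.ConvertTo32Bit(env))
batch_env = tools.BatchEnv(envs, blocking=True)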

View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute a streaming estimation of the mean of submitted tensors."""
from __future__ import absolute_import
@@ -53,9 +52,8 @@ class StreamingMean(object):
# Add a batch dimension if necessary.
if value.shape.ndims == self._sum.shape.ndims:
value = value[None, ...]
return tf.group(
self._sum.assign_add(tf.reduce_sum(value, 0)),
self._count.assign_add(tf.shape(value)[0]))
return tf.group(self._sum.assign_add(tf.reduce_sum(value, 0)),
self._count.assign_add(tf.shape(value)[0]))
def clear(self):
"""Return the mean estimate and reset the streaming statistics."""

View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrappers for OpenAI Gym environments."""
from __future__ import absolute_import
@@ -150,8 +149,7 @@ class FrameHistory(object):
return self._select_frames()
def _select_frames(self):
indices = [
(self._step - index) % self._capacity for index in self._past_indices]
indices = [(self._step - index) % self._capacity for index in self._past_indices]
observ = self._buffer[indices]
if self._flatten:
observ = np.reshape(observ, (-1,) + observ.shape[2:])
@@ -192,14 +190,14 @@ class RangeNormalize(object):
def __init__(self, env, observ=None, action=None):
self._env = env
self._should_normalize_observ = (
observ is not False and self._is_finite(self._env.observation_space))
self._should_normalize_observ = (observ is not False and
self._is_finite(self._env.observation_space))
if observ is True and not self._should_normalize_observ:
raise ValueError('Cannot normalize infinite observation range.')
if observ is None and not self._should_normalize_observ:
tf.logging.info('Not normalizing infinite observation range.')
self._should_normalize_action = (
action is not False and self._is_finite(self._env.action_space))
self._should_normalize_action = (action is not False and
self._is_finite(self._env.action_space))
if action is True and not self._should_normalize_action:
raise ValueError('Cannot normalize infinite action range.')
if action is None and not self._should_normalize_action:
@@ -327,8 +325,7 @@ class ExternalProcess(object):
action_space: The cached action space of the environment.
"""
self._conn, conn = multiprocessing.Pipe()
self._process = multiprocessing.Process(
target=self._worker, args=(constructor, conn))
self._process = multiprocessing.Process(target=self._worker, args=(constructor, conn))
atexit.register(self.close)
self._process.start()
self._observ_space = None
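
The constructor hunk above shows the two halves of ExternalProcess: a Pipe and a worker process that owns the real environment. The worker side is not part of this hunk; the following is a hedged sketch of the general pattern (message names are illustrative, not the wrapper's actual protocol):

import multiprocessing

def _worker(constructor, conn):
  """Own the environment in a child process; serve calls over the pipe."""
  env = constructor()
  while True:
    name, args = conn.recv()
    if name == 'close':
      conn.close()
      break
    # E.g. ('step', (action,)) dispatches to env.step(action).
    conn.send(getattr(env, name)(*args))

# Usage (sketch): pass the child end of a multiprocessing.Pipe() together
# with the environment constructor as args to multiprocessing.Process.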

View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for environment wrappers."""
from __future__ import absolute_import
@@ -28,18 +27,20 @@ from agents import tools
class ExternalProcessTest(tf.test.TestCase):
def test_close_no_hang_after_init(self):
constructor = functools.partial(
tools.MockEnvironment,
observ_shape=(2, 3), action_shape=(2,),
min_duration=2, max_duration=2)
constructor = functools.partial(tools.MockEnvironment,
observ_shape=(2, 3),
action_shape=(2,),
min_duration=2,
max_duration=2)
env = tools.wrappers.ExternalProcess(constructor)
env.close()
def test_close_no_hang_after_step(self):
constructor = functools.partial(
tools.MockEnvironment,
observ_shape=(2, 3), action_shape=(2,),
min_duration=5, max_duration=5)
constructor = functools.partial(tools.MockEnvironment,
observ_shape=(2, 3),
action_shape=(2,),
min_duration=5,
max_duration=5)
env = tools.wrappers.ExternalProcess(constructor)
env.reset()
env.step(env.action_space.sample())
@@ -53,8 +54,7 @@ class ExternalProcessTest(tf.test.TestCase):
env.step(env.action_space.sample())
def test_reraise_exception_in_step(self):
constructor = functools.partial(
MockEnvironmentCrashInStep, crash_at_step=3)
constructor = functools.partial(MockEnvironmentCrashInStep, crash_at_step=3)
env = tools.wrappers.ExternalProcess(constructor)
env.reset()
env.step(env.action_space.sample())
@@ -74,9 +74,10 @@ class MockEnvironmentCrashInStep(tools.MockEnvironment):
"""Raise an error after specified number of steps in an episode."""
def __init__(self, crash_at_step):
super(MockEnvironmentCrashInStep, self).__init__(
observ_shape=(2, 3), action_shape=(2,),
min_duration=crash_at_step + 1, max_duration=crash_at_step + 1)
super(MockEnvironmentCrashInStep, self).__init__(observ_shape=(2, 3),
action_shape=(2,),
min_duration=crash_at_step + 1,
max_duration=crash_at_step + 1)
self._crash_at_step = crash_at_step
def step(self, *args, **kwargs):