Make sure that the pre-trained galloping and trotting policies work for the minitaur_reactive_env and minitaur_trotting_env environments.
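The environments named in the commit message are not part of this diff; a minimal, hypothetical smoke test for one of them might look as follows (the class name, default constructor arguments, and the random-action rollout are assumptions, not code from this commit):

# Hypothetical sketch: roll out random actions in one of the environments the
# commit message refers to, to confirm it constructs and steps without errors.
from pybullet_envs.minitaur.envs import minitaur_trotting_env

env = minitaur_trotting_env.MinitaurTrottingEnv()
observation = env.reset()
for _ in range(100):
  observation, reward, done, _ = env.step(env.action_space.sample())
  if done:
    observation = env.reset()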
@@ -0,0 +1,23 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Executable scripts for reinforcement learning."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from . import train
from . import utility
from . import visualize
@@ -0,0 +1,128 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Example configurations using the PPO algorithm."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# pylint: disable=unused-variable

from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents.scripts import networks


def default():
  """Default configuration for PPO."""
  # General
  algorithm = ppo.PPOAlgorithm
  num_agents = 10
  eval_episodes = 25
  use_gpu = False
  # Network
  network = networks.ForwardGaussianPolicy
  weight_summaries = dict(
      all=r'.*',
      policy=r'.*/policy/.*',
      value=r'.*/value/.*')
  policy_layers = 200, 100
  value_layers = 200, 100
  init_mean_factor = 0.05
  init_logstd = -1
  # Optimization
  update_every = 25
  policy_optimizer = 'AdamOptimizer'
  value_optimizer = 'AdamOptimizer'
  update_epochs_policy = 50
  update_epochs_value = 50
  policy_lr = 1e-4
  value_lr = 3e-4
  # Losses
  discount = 0.985
  kl_target = 1e-2
  kl_cutoff_factor = 2
  kl_cutoff_coef = 1000
  kl_init_penalty = 1
  return locals()


def pendulum():
  """Configuration for the pendulum classic control task."""
  locals().update(default())
  # Environment
  env = 'Pendulum-v0'
  max_length = 200
  steps = 1e6  # 1M
  return locals()


def cheetah():
  """Configuration for MuJoCo's half cheetah task."""
  locals().update(default())
  # Environment
  env = 'HalfCheetah-v1'
  max_length = 1000
  steps = 1e7  # 10M
  return locals()


def walker():
  """Configuration for MuJoCo's walker task."""
  locals().update(default())
  # Environment
  env = 'Walker2d-v1'
  max_length = 1000
  steps = 1e7  # 10M
  return locals()


def reacher():
  """Configuration for MuJoCo's reacher task."""
  locals().update(default())
  # Environment
  env = 'Reacher-v1'
  max_length = 1000
  steps = 1e7  # 10M
  return locals()


def hopper():
  """Configuration for MuJoCo's hopper task."""
  locals().update(default())
  # Environment
  env = 'Hopper-v1'
  max_length = 1000
  steps = 2e7  # 20M
  return locals()


def ant():
  """Configuration for MuJoCo's ant task."""
  locals().update(default())
  # Environment
  env = 'Ant-v1'
  max_length = 1000
  steps = 5e7  # 50M
  return locals()


def humanoid():
  """Configuration for MuJoCo's humanoid task."""
  locals().update(default())
  # Environment
  env = 'Humanoid-v1'
  max_length = 1000
  steps = 5e7  # 50M
  return locals()
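Each configuration above returns locals(), so the function's local variables become a flat dictionary of hyperparameters. A minimal sketch of how such a configuration might be consumed, assuming the tools.AttrDict wrapper used elsewhere in this commit:

# Hypothetical usage sketch: wrap a configuration function's dictionary in an
# attribute dict and read hyperparameters from it.
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs

config = tools.AttrDict(configs.pendulum())
print(config.env, config.max_length, int(config.steps))  # Pendulum-v0 200 1000000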
@@ -0,0 +1,167 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Networks for the PPO algorithm defined as recurrent cells."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


_MEAN_WEIGHTS_INITIALIZER = tf.contrib.layers.variance_scaling_initializer(
    factor=0.1)
_LOGSTD_INITIALIZER = tf.random_normal_initializer(-1, 1e-10)


class LinearGaussianPolicy(tf.contrib.rnn.RNNCell):
  """Independent linear network with a tanh at the end for the policy and a feed forward network for the value.

  The policy network outputs the mean action and the log standard deviation
  is learned as an independent parameter vector.
  """

  def __init__(self,
               policy_layers,
               value_layers,
               action_size,
               mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
               logstd_initializer=_LOGSTD_INITIALIZER):
    self._policy_layers = policy_layers
    self._value_layers = value_layers
    self._action_size = action_size
    self._mean_weights_initializer = mean_weights_initializer
    self._logstd_initializer = logstd_initializer

  @property
  def state_size(self):
    unused_state_size = 1
    return unused_state_size

  @property
  def output_size(self):
    return (self._action_size, self._action_size, tf.TensorShape([]))

  def __call__(self, observation, state):
    with tf.variable_scope('policy'):
      x = tf.contrib.layers.flatten(observation)
      mean = tf.contrib.layers.fully_connected(
          x,
          self._action_size,
          tf.tanh,
          weights_initializer=self._mean_weights_initializer)
      logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32,
                               self._logstd_initializer)
      logstd = tf.tile(logstd[None, ...],
                       [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
    with tf.variable_scope('value'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._value_layers:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
    return (mean, logstd, value), state


class ForwardGaussianPolicy(tf.contrib.rnn.RNNCell):
  """Independent feed forward networks for policy and value.

  The policy network outputs the mean action and the log standard deviation
  is learned as an independent parameter vector.
  """

  def __init__(
      self, policy_layers, value_layers, action_size,
      mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
      logstd_initializer=_LOGSTD_INITIALIZER):
    self._policy_layers = policy_layers
    self._value_layers = value_layers
    self._action_size = action_size
    self._mean_weights_initializer = mean_weights_initializer
    self._logstd_initializer = logstd_initializer

  @property
  def state_size(self):
    unused_state_size = 1
    return unused_state_size

  @property
  def output_size(self):
    return (self._action_size, self._action_size, tf.TensorShape([]))

  def __call__(self, observation, state):
    with tf.variable_scope('policy'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._policy_layers:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      mean = tf.contrib.layers.fully_connected(
          x, self._action_size, tf.tanh,
          weights_initializer=self._mean_weights_initializer)
      logstd = tf.get_variable(
          'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
      logstd = tf.tile(
          logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
    with tf.variable_scope('value'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._value_layers:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
    return (mean, logstd, value), state


class RecurrentGaussianPolicy(tf.contrib.rnn.RNNCell):
  """Independent recurrent policy and feed forward value networks.

  The policy network outputs the mean action and the log standard deviation
  is learned as an independent parameter vector. The last policy layer is
  recurrent and uses a GRU cell.
  """

  def __init__(
      self, policy_layers, value_layers, action_size,
      mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
      logstd_initializer=_LOGSTD_INITIALIZER):
    self._policy_layers = policy_layers
    self._value_layers = value_layers
    self._action_size = action_size
    self._mean_weights_initializer = mean_weights_initializer
    self._logstd_initializer = logstd_initializer
    self._cell = tf.contrib.rnn.GRUBlockCell(100)

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return (self._action_size, self._action_size, tf.TensorShape([]))

  def __call__(self, observation, state):
    with tf.variable_scope('policy'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._policy_layers[:-1]:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      x, state = self._cell(x, state)
      mean = tf.contrib.layers.fully_connected(
          x, self._action_size, tf.tanh,
          weights_initializer=self._mean_weights_initializer)
      logstd = tf.get_variable(
          'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
      logstd = tf.tile(
          logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
    with tf.variable_scope('value'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._value_layers:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
    return (mean, logstd, value), state
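All three classes above expose the same RNNCell interface: called on a batch of observations (and a state that only the recurrent variant actually uses), they return the action mean, the log standard deviation, and the value estimate. A hedged construction sketch, with the observation and action sizes chosen purely for illustration:

# Hypothetical sketch: build one of the policy cells above and apply it to a
# placeholder batch of observations. Sizes are illustrative assumptions.
import tensorflow as tf

from pybullet_envs.minitaur.agents.scripts import networks

cell = networks.ForwardGaussianPolicy(
    policy_layers=(200, 100), value_layers=(200, 100), action_size=8)
observation = tf.placeholder(tf.float32, [None, 28])
state = tf.zeros([tf.shape(observation)[0], 1])  # ignored by the feed forward cell
(mean, logstd, value), _ = cell(observation, state)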
@@ -0,0 +1,165 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Script to train a batch reinforcement learning algorithm.

Command line:

  python3 -m agents.scripts.train --logdir=/path/to/logdir --config=pendulum
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import functools
import os

import gym
import tensorflow as tf

from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import utility


def _create_environment(config):
  """Constructor for an instance of the environment.

  Args:
    config: Object providing configurations via attributes.

  Returns:
    Wrapped OpenAI Gym environment.
  """
  if isinstance(config.env, str):
    env = gym.make(config.env)
  else:
    env = config.env()
  if config.max_length:
    env = tools.wrappers.LimitDuration(env, config.max_length)
  env = tools.wrappers.RangeNormalize(env)
  env = tools.wrappers.ClipAction(env)
  env = tools.wrappers.ConvertTo32Bit(env)
  return env


def _define_loop(graph, logdir, train_steps, eval_steps):
  """Create and configure a training loop with training and evaluation phases.

  Args:
    graph: Object providing graph elements via attributes.
    logdir: Log directory for storing checkpoints and summaries.
    train_steps: Number of training steps per epoch.
    eval_steps: Number of evaluation steps per epoch.

  Returns:
    Loop object.
  """
  loop = tools.Loop(
      logdir, graph.step, graph.should_log, graph.do_report,
      graph.force_reset)
  loop.add_phase(
      'train', graph.done, graph.score, graph.summary, train_steps,
      report_every=None,
      log_every=train_steps // 2,
      checkpoint_every=None,
      feed={graph.is_training: True})
  loop.add_phase(
      'eval', graph.done, graph.score, graph.summary, eval_steps,
      report_every=eval_steps,
      log_every=eval_steps // 2,
      checkpoint_every=10 * eval_steps,
      feed={graph.is_training: False})
  return loop


def train(config, env_processes):
  """Training and evaluation entry point yielding scores.

  Resolves some configuration attributes, creates environments, graph, and
  training loop. By default, assigns all operations to the CPU.

  Args:
    config: Object providing configurations via attributes.
    env_processes: Whether to step environments in separate processes.

  Yields:
    Evaluation scores.
  """
  tf.reset_default_graph()
  with config.unlocked:
    config.network = functools.partial(
        utility.define_network, config.network, config)
    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
    config.value_optimizer = getattr(tf.train, config.value_optimizer)
  if config.update_every % config.num_agents:
    tf.logging.warn('Number of agents should divide episodes per update.')
  with tf.device('/cpu:0'):
    batch_env = utility.define_batch_env(
        lambda: _create_environment(config),
        config.num_agents, env_processes)
    graph = utility.define_simulation_graph(
        batch_env, config.algorithm, config)
    loop = _define_loop(
        graph, config.logdir,
        config.update_every * config.max_length,
        config.eval_episodes * config.max_length)
    total_steps = int(
        config.steps / config.update_every *
        (config.update_every + config.eval_episodes))
  # Exclude episode related variables since the Python state of environments is
  # not checkpointed and thus new episodes start after resuming.
  saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  sess_config.gpu_options.allow_growth = True
  with tf.Session(config=sess_config) as sess:
    utility.initialize_variables(sess, saver, config.logdir)
    for score in loop.run(sess, saver, total_steps):
      yield score
  batch_env.close()


def main(_):
  """Create or load configuration and launch the trainer."""
  utility.set_up_logging()
  if not FLAGS.config:
    raise KeyError('You must specify a configuration.')
  logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
  try:
    config = utility.load_config(logdir)
  except IOError:
    config = tools.AttrDict(getattr(configs, FLAGS.config)())
    config = utility.save_config(config, logdir)
  for score in train(config, FLAGS.env_processes):
    tf.logging.info('Score {}.'.format(score))


if __name__ == '__main__':
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string(
      'logdir', None,
      'Base directory to store logs.')
  tf.app.flags.DEFINE_string(
      'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
      'Sub directory to store logs.')
  tf.app.flags.DEFINE_string(
      'config', None,
      'Configuration to execute.')
  tf.app.flags.DEFINE_boolean(
      'env_processes', True,
      'Step environments in separate processes to circumvent the GIL.')
  tf.app.run()
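Besides the flags-based entry point, train.train can be driven directly from Python, which is what the tests later in this commit do. A minimal sketch of that flow, assuming a writable placeholder log directory such as /tmp/pendulum:

# Hypothetical sketch: run training for the pendulum configuration without the
# command-line flags. The log directory path is a placeholder.
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import train
from pybullet_envs.minitaur.agents.scripts import utility

config = tools.AttrDict(configs.pendulum())
config = utility.save_config(config, logdir='/tmp/pendulum')
for score in train.train(config, env_processes=True):
  print('Evaluation score: {}'.format(score))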
@@ -0,0 +1,110 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for the PPO algorithm usage example."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import itertools

import tensorflow as tf

from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import networks
from pybullet_envs.minitaur.agents.scripts import train


FLAGS = tf.app.flags.FLAGS


class PPOTest(tf.test.TestCase):

  def test_no_crash_cheetah(self):
    nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
    for network in nets:
      config = self._define_config()
      with config.unlocked:
        config.env = 'HalfCheetah-v1'
        config.max_length = 200
        config.steps = 1000
        config.network = network
      for score in train.train(config, env_processes=True):
        float(score)

  def test_no_crash_ant(self):
    nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
    for network in nets:
      config = self._define_config()
      with config.unlocked:
        config.env = 'Ant-v1'
        config.max_length = 200
        config.steps = 1000
        config.network = network
      for score in train.train(config, env_processes=True):
        float(score)

  def test_no_crash_observation_shape(self):
    nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
    observ_shapes = (1,), (2, 3), (2, 3, 4)
    for network, observ_shape in itertools.product(nets, observ_shapes):
      config = self._define_config()
      with config.unlocked:
        config.env = functools.partial(
            tools.MockEnvironment, observ_shape, action_shape=(3,),
            min_duration=15, max_duration=15)
        config.max_length = 20
        config.steps = 100
        config.network = network
      for score in train.train(config, env_processes=False):
        float(score)

  def test_no_crash_variable_duration(self):
    config = self._define_config()
    with config.unlocked:
      config.env = functools.partial(
          tools.MockEnvironment, observ_shape=(2, 3), action_shape=(3,),
          min_duration=5, max_duration=25)
      config.max_length = 25
      config.steps = 200
      config.network = networks.RecurrentGaussianPolicy
    for score in train.train(config, env_processes=False):
      float(score)

  def _define_config(self):
    # Start from the example configuration.
    locals().update(configs.default())
    # pylint: disable=unused-variable
    # General
    algorithm = ppo.PPOAlgorithm
    num_agents = 2
    update_every = 4
    use_gpu = False
    # Network
    policy_layers = 20, 10
    value_layers = 20, 10
    # Optimization
    update_epochs_policy = 2
    update_epochs_value = 2
    # pylint: enable=unused-variable
    return tools.AttrDict(locals())


if __name__ == '__main__':
  FLAGS.config = 'unused'
  tf.test.main()
@@ -0,0 +1,213 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for using reinforcement learning algorithms."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import os
import re

import ruamel.yaml as yaml
import tensorflow as tf

from pybullet_envs.minitaur.agents import tools


def define_simulation_graph(batch_env, algo_cls, config):
  """Define the algorithm and environment interaction.

  Args:
    batch_env: In-graph environments object.
    algo_cls: Constructor of a batch algorithm.
    config: Configuration object for the algorithm.

  Returns:
    Object providing graph elements via attributes.
  """
  # pylint: disable=unused-variable
  step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
  is_training = tf.placeholder(tf.bool, name='is_training')
  should_log = tf.placeholder(tf.bool, name='should_log')
  do_report = tf.placeholder(tf.bool, name='do_report')
  force_reset = tf.placeholder(tf.bool, name='force_reset')
  algo = algo_cls(batch_env, step, is_training, should_log, config)
  done, score, summary = tools.simulate(
      batch_env, algo, should_log, force_reset)
  message = 'Graph contains {} trainable variables.'
  tf.logging.info(message.format(tools.count_weights()))
  # pylint: enable=unused-variable
  return tools.AttrDict(locals())


def define_batch_env(constructor, num_agents, env_processes):
  """Create environments and apply all desired wrappers.

  Args:
    constructor: Constructor of an OpenAI gym environment.
    num_agents: Number of environments to combine in the batch.
    env_processes: Whether to step environments in external processes.

  Returns:
    In-graph environments object.
  """
  with tf.variable_scope('environments'):
    if env_processes:
      envs = [
          tools.wrappers.ExternalProcess(constructor)
          for _ in range(num_agents)]
    else:
      envs = [constructor() for _ in range(num_agents)]
    batch_env = tools.BatchEnv(envs, blocking=not env_processes)
    batch_env = tools.InGraphBatchEnv(batch_env)
  return batch_env


def define_saver(exclude=None):
  """Create a saver for the variables we want to checkpoint.

  Args:
    exclude: List of regexes to match variable names to exclude.

  Returns:
    Saver object.
  """
  variables = []
  exclude = exclude or []
  exclude = [re.compile(regex) for regex in exclude]
  for variable in tf.global_variables():
    if any(regex.match(variable.name) for regex in exclude):
      continue
    variables.append(variable)
  saver = tf.train.Saver(variables, keep_checkpoint_every_n_hours=5)
  return saver


def define_network(constructor, config, action_size):
  """Constructor for the recurrent cell for the algorithm.

  Args:
    constructor: Callable returning the network as RNNCell.
    config: Object providing configurations via attributes.
    action_size: Integer indicating the amount of action dimensions.

  Returns:
    Created recurrent cell object.
  """
  mean_weights_initializer = (
      tf.contrib.layers.variance_scaling_initializer(
          factor=config.init_mean_factor))
  logstd_initializer = tf.random_normal_initializer(
      config.init_logstd, 1e-10)
  network = constructor(
      config.policy_layers, config.value_layers, action_size,
      mean_weights_initializer=mean_weights_initializer,
      logstd_initializer=logstd_initializer)
  return network


def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
  """Initialize or restore variables from a checkpoint if available.

  Args:
    sess: Session to initialize variables in.
    saver: Saver to restore variables.
    logdir: Directory to search for checkpoints.
    checkpoint: Specify what checkpoint name to use; defaults to most recent.
    resume: Whether to expect recovering a checkpoint or starting a new run.

  Raises:
    ValueError: If resume expected but no log directory specified.
    RuntimeError: If no resume expected but a checkpoint was found.
  """
  sess.run(tf.group(
      tf.local_variables_initializer(),
      tf.global_variables_initializer()))
  if resume and not (logdir or checkpoint):
    raise ValueError('Need to specify logdir to resume a checkpoint.')
  if logdir:
    state = tf.train.get_checkpoint_state(logdir)
    if checkpoint:
      checkpoint = os.path.join(logdir, checkpoint)
    if not checkpoint and state and state.model_checkpoint_path:
      checkpoint = state.model_checkpoint_path
    if checkpoint and resume is False:
      message = 'Found unexpected checkpoint when starting a new run.'
      raise RuntimeError(message)
    if checkpoint:
      saver.restore(sess, checkpoint)


def save_config(config, logdir=None):
  """Save a new configuration by name.

  If a logging directory is specified, it will be created and the configuration
  will be stored there. Otherwise, a log message will be printed.

  Args:
    config: Configuration object.
    logdir: Location for writing summaries and checkpoints if specified.

  Returns:
    Configuration object.
  """
  if logdir:
    with config.unlocked:
      config.logdir = logdir
    message = 'Start a new run and write summaries and checkpoints to {}.'
    tf.logging.info(message.format(config.logdir))
    tf.gfile.MakeDirs(config.logdir)
    config_path = os.path.join(config.logdir, 'config.yaml')
    with tf.gfile.FastGFile(config_path, 'w') as file_:
      yaml.dump(config, file_, default_flow_style=False)
  else:
    message = (
        'Start a new run without storing summaries and checkpoints since no '
        'logging directory was specified.')
    tf.logging.info(message)
  return config


def load_config(logdir):
  """Load a configuration from the log directory.

  Args:
    logdir: The logging directory containing the configuration file.

  Raises:
    IOError: The logging directory does not contain a configuration file.

  Returns:
    Configuration object.
  """
  config_path = logdir and os.path.join(logdir, 'config.yaml')
  if not config_path or not tf.gfile.Exists(config_path):
    message = (
        'Cannot resume an existing run since the logging directory does not '
        'contain a configuration file.')
    raise IOError(message)
  with tf.gfile.FastGFile(config_path, 'r') as file_:
    config = yaml.load(file_)
  message = 'Resume run and write summaries and checkpoints to {}.'
  tf.logging.info(message.format(config.logdir))
  return config


def set_up_logging():
  """Configure the TensorFlow logger."""
  tf.logging.set_verbosity(tf.logging.INFO)
  logging.getLogger('tensorflow').propagate = False
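Taken together, define_saver and initialize_variables implement the checkpoint handling that both train.py and visualize.py rely on. A brief sketch of that pattern in isolation (the dummy variable and the log directory are placeholders, not part of this commit):

# Hypothetical sketch: build a saver that skips temporary PPO variables and
# initialize or restore a session from a log directory.
import tensorflow as tf

from pybullet_envs.minitaur.agents.scripts import utility

tf.Variable(0, name='dummy')  # stands in for the training graph's variables
saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
with tf.Session() as sess:
  utility.initialize_variables(sess, saver, logdir='/tmp/pendulum')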
@@ -0,0 +1,157 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Script to render videos of the Proximal Policy Optimization (PPO) algorithm.

Command line:

  python3 -m agents.scripts.visualize \
      --logdir=/path/to/logdir/<time>-<config> --outdir=/path/to/outdir/
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import os

import gym
import tensorflow as tf

from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import utility


def _create_environment(config, outdir):
  """Constructor for an instance of the environment.

  Args:
    config: Object providing configurations via attributes.
    outdir: Directory to store videos in.

  Returns:
    Wrapped OpenAI Gym environment.
  """
  if isinstance(config.env, str):
    env = gym.make(config.env)
  else:
    env = config.env()
  # Ensure that the environment has the specification attribute set as expected
  # by the monitor wrapper.
  if not hasattr(env, 'spec'):
    setattr(env, 'spec', getattr(env, 'spec', None))
  if config.max_length:
    env = tools.wrappers.LimitDuration(env, config.max_length)
  # env = gym.wrappers.Monitor(
  #     env, outdir, lambda unused_episode_number: True)
  env = tools.wrappers.RangeNormalize(env)
  env = tools.wrappers.ClipAction(env)
  env = tools.wrappers.ConvertTo32Bit(env)
  return env


def _define_loop(graph, eval_steps):
  """Create and configure an evaluation loop.

  Args:
    graph: Object providing graph elements via attributes.
    eval_steps: Number of evaluation steps per epoch.

  Returns:
    Loop object.
  """
  loop = tools.Loop(
      None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
  loop.add_phase(
      'eval', graph.done, graph.score, graph.summary, eval_steps,
      report_every=eval_steps,
      log_every=None,
      checkpoint_every=None,
      feed={graph.is_training: False})
  return loop


def visualize(
    logdir, outdir, num_agents, num_episodes, checkpoint=None,
    env_processes=True):
  """Recover checkpoint and render videos from it.

  Args:
    logdir: Logging directory of the trained algorithm.
    outdir: Directory to store rendered videos in.
    num_agents: Number of environments to simulate in parallel.
    num_episodes: Total number of episodes to simulate.
    checkpoint: Checkpoint name to load; defaults to most recent.
    env_processes: Whether to step environments in separate processes.
  """
  config = utility.load_config(logdir)
  with config.unlocked:
    config.network = functools.partial(
        utility.define_network, config.network, config)
    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
    config.value_optimizer = getattr(tf.train, config.value_optimizer)
  with tf.device('/cpu:0'):
    batch_env = utility.define_batch_env(
        lambda: _create_environment(config, outdir),
        num_agents, env_processes)
    graph = utility.define_simulation_graph(
        batch_env, config.algorithm, config)
    total_steps = num_episodes * config.max_length
    loop = _define_loop(graph, total_steps)
  saver = utility.define_saver(
      exclude=(r'.*_temporary/.*', r'global_step'))
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  sess_config.gpu_options.allow_growth = True
  with tf.Session(config=sess_config) as sess:
    utility.initialize_variables(
        sess, saver, config.logdir, checkpoint, resume=True)
    for unused_score in loop.run(sess, saver, total_steps):
      pass
  batch_env.close()


def main(_):
  """Load a trained algorithm and render videos."""
  utility.set_up_logging()
  if not FLAGS.logdir or not FLAGS.outdir:
    raise KeyError('You must specify logging and output directories.')
  FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
  FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
  visualize(
      FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
      FLAGS.checkpoint, FLAGS.env_processes)


if __name__ == '__main__':
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string(
      'logdir', None,
      'Directory to the checkpoint of a training run.')
  tf.app.flags.DEFINE_string(
      'outdir', None,
      'Local directory for storing the monitoring outdir.')
  tf.app.flags.DEFINE_string(
      'checkpoint', None,
      'Checkpoint name to load; defaults to most recent.')
  tf.app.flags.DEFINE_integer(
      'num_agents', 1,
      'How many environments to step in parallel.')
  tf.app.flags.DEFINE_integer(
      'num_episodes', 5,
      'Minimum number of episodes to render.')
  tf.app.flags.DEFINE_boolean(
      'env_processes', True,
      'Step environments in separate processes to circumvent the GIL.')
  tf.app.run()
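The same functionality is available programmatically; a hedged sketch with placeholder paths:

# Hypothetical sketch: render evaluation episodes from a finished training run
# without going through the flags. Both paths are placeholders.
from pybullet_envs.minitaur.agents.scripts import visualize

visualize.visualize(
    logdir='/tmp/pendulum/20170101T000000-pendulum', outdir='/tmp/videos',
    num_agents=1, num_episodes=5, checkpoint=None, env_processes=True)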