Merge pull request #1346 from erwincoumans/master

revert to original agents train/visualize scripts (from github)
Authored by erwincoumans on 2017-09-28 11:36:15 -07:00; committed by GitHub.
3 changed files with 306 additions and 58 deletions

Changed file 1 of 3: PPO configuration for the minitaur environments

@@ -1,8 +1,25 @@
-"""The PPO training configuration file for minitaur environments."""
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Example configurations using the PPO algorithm."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
 import functools
+
 from agents import ppo
 from agents.scripts import networks
 from pybullet_envs.bullet import minitaur_gym_env
@@ -11,33 +28,33 @@ import pybullet_envs.bullet.minitaur_gym_env as minitaur_gym_env
 import pybullet_envs
 
+# pylint: disable=unused-variable
 
 def default():
-  """The default configurations."""
+  """Default configuration for PPO."""
   # General
   algorithm = ppo.PPOAlgorithm
-  num_agents = 25
+  num_agents = 10
   eval_episodes = 25
   use_gpu = False
   # Network
   network = networks.ForwardGaussianPolicy
   weight_summaries = dict(
-      all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
+      all=r'.*',
+      policy=r'.*/policy/.*',
+      value=r'.*/value/.*')
   policy_layers = 200, 100
   value_layers = 200, 100
-  init_mean_factor = 0.2
+  init_mean_factor = 0.05
   init_logstd = -1
-  network_config = dict()
   # Optimization
   update_every = 25
   policy_optimizer = 'AdamOptimizer'
   value_optimizer = 'AdamOptimizer'
-  update_epochs_policy = 25
-  update_epochs_value = 25
-  value_lr = 1e-3
+  update_epochs_policy = 50
+  update_epochs_value = 50
   policy_lr = 1e-4
+  value_lr = 3e-4
   # Losses
-  discount = 0.99
+  discount = 0.985
   kl_target = 1e-2
   kl_cutoff_factor = 2
   kl_cutoff_coef = 1000
@@ -107,4 +124,3 @@ def pybullet_minitaur():
   steps = 3e7  # 30M
   return locals()
 
-

Changed file 2 of 3: PPO training script

@@ -1,8 +1,22 @@
-r"""Script to use Proximal Policy Gradient for the minitaur environments.
-
-Run:
-python train_ppo.py --logdif=/tmp/train --config=minitaur_pybullet
-
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Script to train a batch reinforcement learning algorithm.
+
+Command line:
+
+  python3 -m agents.scripts.train --logdir=/path/to/logdir --config=pendulum
 """
 
 from __future__ import absolute_import
@@ -10,39 +24,142 @@ from __future__ import division
 from __future__ import print_function
 
 import datetime
+import functools
 import os
 
+import gym
 import tensorflow as tf
 
 from agents import tools
-from agents.scripts import train
+from . import configs
 from agents.scripts import utility
-from . import config_ppo
-
-flags = tf.app.flags
-FLAGS = tf.app.flags.FLAGS
-flags.DEFINE_string(
-    'logdir', None,
-    'Base directory to store logs.')
-flags.DEFINE_string(
-    'config', None,
-    'Configuration to execute.')
-flags.DEFINE_string(
-    'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
-    'Sub directory to store logs.')
+
+
+def _create_environment(config):
+  """Constructor for an instance of the environment.
+
+  Args:
+    config: Object providing configurations via attributes.
+
+  Returns:
+    Wrapped OpenAI Gym environment.
+  """
+  if isinstance(config.env, str):
+    env = gym.make(config.env)
+  else:
+    env = config.env()
+  if config.max_length:
+    env = tools.wrappers.LimitDuration(env, config.max_length)
+  env = tools.wrappers.RangeNormalize(env)
+  env = tools.wrappers.ClipAction(env)
+  env = tools.wrappers.ConvertTo32Bit(env)
+  return env
+
+
+def _define_loop(graph, logdir, train_steps, eval_steps):
+  """Create and configure a training loop with training and evaluation phases.
+
+  Args:
+    graph: Object providing graph elements via attributes.
+    logdir: Log directory for storing checkpoints and summaries.
+    train_steps: Number of training steps per epoch.
+    eval_steps: Number of evaluation steps per epoch.
+
+  Returns:
+    Loop object.
+  """
+  loop = tools.Loop(
+      logdir, graph.step, graph.should_log, graph.do_report,
+      graph.force_reset)
+  loop.add_phase(
+      'train', graph.done, graph.score, graph.summary, train_steps,
+      report_every=None,
+      log_every=train_steps // 2,
+      checkpoint_every=None,
+      feed={graph.is_training: True})
+  loop.add_phase(
+      'eval', graph.done, graph.score, graph.summary, eval_steps,
+      report_every=eval_steps,
+      log_every=eval_steps // 2,
+      checkpoint_every=10 * eval_steps,
+      feed={graph.is_training: False})
+  return loop
+
+
+def train(config, env_processes):
+  """Training and evaluation entry point yielding scores.
+
+  Resolves some configuration attributes, creates environments, graph, and
+  training loop. By default, assigns all operations to the CPU.
+
+  Args:
+    config: Object providing configurations via attributes.
+    env_processes: Whether to step environments in separate processes.
+
+  Yields:
+    Evaluation scores.
+  """
+  tf.reset_default_graph()
+  with config.unlocked:
+    config.network = functools.partial(
+        utility.define_network, config.network, config)
+    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
+    config.value_optimizer = getattr(tf.train, config.value_optimizer)
+  if config.update_every % config.num_agents:
+    tf.logging.warn('Number of agents should divide episodes per update.')
+  with tf.device('/cpu:0'):
+    batch_env = utility.define_batch_env(
+        lambda: _create_environment(config),
+        config.num_agents, env_processes)
+    graph = utility.define_simulation_graph(
+        batch_env, config.algorithm, config)
+    loop = _define_loop(
+        graph, config.logdir,
+        config.update_every * config.max_length,
+        config.eval_episodes * config.max_length)
+    total_steps = int(
+        config.steps / config.update_every *
+        (config.update_every + config.eval_episodes))
+  # Exclude episode related variables since the Python state of environments is
+  # not checkpointed and thus new episodes start after resuming.
+  saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
+  sess_config = tf.ConfigProto(allow_soft_placement=True)
+  sess_config.gpu_options.allow_growth = True
+  with tf.Session(config=sess_config) as sess:
+    utility.initialize_variables(sess, saver, config.logdir)
+    for score in loop.run(sess, saver, total_steps):
+      yield score
+  batch_env.close()
 
 
 def main(_):
   """Create or load configuration and launch the trainer."""
-  config = tools.AttrDict(getattr(config_ppo, FLAGS.config)())
-  logdir = FLAGS.logdir and os.path.join(
-      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config))
-  utility.save_config(config, logdir)
-  for score in train.train(config, env_processes=True):
-    tf.logging.info(str(score))
+  utility.set_up_logging()
+  if not FLAGS.config:
+    raise KeyError('You must specify a configuration.')
+  logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
+      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
+  try:
+    config = utility.load_config(logdir)
+  except IOError:
+    config = tools.AttrDict(getattr(configs, FLAGS.config)())
+    config = utility.save_config(config, logdir)
+  for score in train(config, FLAGS.env_processes):
+    tf.logging.info('Score {}.'.format(score))
 
 
 if __name__ == '__main__':
+  FLAGS = tf.app.flags.FLAGS
+  tf.app.flags.DEFINE_string(
+      'logdir', None,
+      'Base directory to store logs.')
+  tf.app.flags.DEFINE_string(
+      'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
+      'Sub directory to store logs.')
+  tf.app.flags.DEFINE_string(
+      'config', None,
+      'Configuration to execute.')
+  tf.app.flags.DEFINE_boolean(
+      'env_processes', True,
+      'Step environments in separate processes to circumvent the GIL.')
   tf.app.run()
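The new train() derives its overall simulator budget from the configuration: the training phase of each loop epoch runs for update_every * max_length steps, the evaluation phase for eval_episodes * max_length steps, and total_steps scales config.steps up so that the evaluation episodes come on top of the requested training steps. The small worked example below is just that arithmetic, not the agents API; it assumes the pybullet_minitaur value steps = 3e7 from the first file together with the default update_every and eval_episodes of 25 shown above.

# Worked example of the step-budget arithmetic in train() (assumed config values).
steps = 3e7          # pybullet_minitaur: 30M training steps
update_every = 25    # episodes collected per policy update (default above)
eval_episodes = 25   # episodes per evaluation phase (default above)

total_steps = int(steps / update_every * (update_every + eval_episodes))
print(total_steps)   # 60000000: with eval_episodes == update_every the budget doubles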

Changed file 3 of 3: PPO visualization script

@@ -1,42 +1,157 @@
-r"""Script to visualize the trained PPO agent.
-
-python -m pybullet_envs.agents.visualize \
-    --logdir=ppo
-    --outdir=/tmp/video/
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Script to render videos of the Proximal Policy Gradient algorithm.
+
+Command line:
+
+  python3 -m agents.scripts.visualize \
+      --logdir=/path/to/logdir/<time>-<config> --outdir=/path/to/outdir/
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import os
+
+import gym
 import tensorflow as tf
-from agents.scripts import visualize
+
+from agents import tools
+from agents.scripts import utility
 
-flags = tf.app.flags
-FLAGS = tf.app.flags.FLAGS
-flags.DEFINE_string("logdir", None,
-                    "Directory to the checkpoint of a training run.")
-flags.DEFINE_string("outdir", None,
-                    "Local directory for storing the monitoring outdir.")
-flags.DEFINE_string("checkpoint", None,
-                    "Checkpoint name to load; defaults to most recent.")
-flags.DEFINE_integer("num_agents", 1,
-                     "How many environments to step in parallel.")
-flags.DEFINE_integer("num_episodes", 1, "Minimum number of episodes to render.")
-flags.DEFINE_boolean(
-    "env_processes", False,
-    "Step environments in separate processes to circumvent the GIL.")
+
+def _create_environment(config, outdir):
+  """Constructor for an instance of the environment.
+
+  Args:
+    config: Object providing configurations via attributes.
+    outdir: Directory to store videos in.
+
+  Returns:
+    Wrapped OpenAI Gym environment.
+  """
+  if isinstance(config.env, str):
+    env = gym.make(config.env)
+  else:
+    env = config.env()
+  # Ensure that the environment has the specification attribute set as expected
+  # by the monitor wrapper.
+  if not hasattr(env, 'spec'):
+    setattr(env, 'spec', getattr(env, 'spec', None))
+  if config.max_length:
+    env = tools.wrappers.LimitDuration(env, config.max_length)
+  env = gym.wrappers.Monitor(
+      env, outdir, lambda unused_episode_number: True)
+  env = tools.wrappers.RangeNormalize(env)
+  env = tools.wrappers.ClipAction(env)
+  env = tools.wrappers.ConvertTo32Bit(env)
+  return env
+
+
+def _define_loop(graph, eval_steps):
+  """Create and configure an evaluation loop.
+
+  Args:
+    graph: Object providing graph elements via attributes.
+    eval_steps: Number of evaluation steps per epoch.
+
+  Returns:
+    Loop object.
+  """
+  loop = tools.Loop(
+      None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
+  loop.add_phase(
+      'eval', graph.done, graph.score, graph.summary, eval_steps,
+      report_every=eval_steps,
+      log_every=None,
+      checkpoint_every=None,
+      feed={graph.is_training: False})
+  return loop
+
+
+def visualize(
+    logdir, outdir, num_agents, num_episodes, checkpoint=None,
+    env_processes=True):
+  """Recover checkpoint and render videos from it.
+
+  Args:
+    logdir: Logging directory of the trained algorithm.
+    outdir: Directory to store rendered videos in.
+    num_agents: Number of environments to simulate in parallel.
+    num_episodes: Total number of episodes to simulate.
+    checkpoint: Checkpoint name to load; defaults to most recent.
+    env_processes: Whether to step environments in separate processes.
+  """
+  config = utility.load_config(logdir)
+  with config.unlocked:
+    config.network = functools.partial(
+        utility.define_network, config.network, config)
+    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
+    config.value_optimizer = getattr(tf.train, config.value_optimizer)
+  with tf.device('/cpu:0'):
+    batch_env = utility.define_batch_env(
+        lambda: _create_environment(config, outdir),
+        num_agents, env_processes)
+    graph = utility.define_simulation_graph(
+        batch_env, config.algorithm, config)
+    total_steps = num_episodes * config.max_length
+    loop = _define_loop(graph, total_steps)
+  saver = utility.define_saver(
+      exclude=(r'.*_temporary/.*', r'global_step'))
+  sess_config = tf.ConfigProto(allow_soft_placement=True)
+  sess_config.gpu_options.allow_growth = True
+  with tf.Session(config=sess_config) as sess:
+    utility.initialize_variables(
+        sess, saver, config.logdir, checkpoint, resume=True)
+    for unused_score in loop.run(sess, saver, total_steps):
+      pass
+  batch_env.close()
 
 
 def main(_):
-  visualize.visualize(FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents,
-                      FLAGS.num_episodes, FLAGS.checkpoint, FLAGS.env_processes)
+  """Load a trained algorithm and render videos."""
+  utility.set_up_logging()
+  if not FLAGS.logdir or not FLAGS.outdir:
+    raise KeyError('You must specify logging and outdirs directories.')
+  FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
+  FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
+  visualize(
+      FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
+      FLAGS.checkpoint, FLAGS.env_processes)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
+  FLAGS = tf.app.flags.FLAGS
+  tf.app.flags.DEFINE_string(
+      'logdir', None,
+      'Directory to the checkpoint of a training run.')
+  tf.app.flags.DEFINE_string(
+      'outdir', None,
+      'Local directory for storing the monitoring outdir.')
+  tf.app.flags.DEFINE_string(
+      'checkpoint', None,
+      'Checkpoint name to load; defaults to most recent.')
+  tf.app.flags.DEFINE_integer(
+      'num_agents', 1,
+      'How many environments to step in parallel.')
+  tf.app.flags.DEFINE_integer(
+      'num_episodes', 5,
+      'Minimum number of episodes to render.')
+  tf.app.flags.DEFINE_boolean(
+      'env_processes', True,
+      'Step environments in separate processes to circumvent the GIL.')
   tf.app.run()
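Because the flags now live under the __main__ guard, the visualize() function defined in this file can also be driven directly from Python rather than through tf.app.run(). The sketch below is a hypothetical usage example: the import path and both directory paths are placeholders (the diff does not show the file name); only the function signature and the <timestamp>-<config> directory layout come from the code above.

# Hypothetical usage sketch: module name and paths are placeholders, not from the diff.
from pybullet_envs.agents import visualize_ppo  # assumed import path for the file above

visualize_ppo.visualize(
    logdir='/path/to/logdir/20170928T113615-pybullet_minitaur',  # <timestamp>-<config> dir written by the train script
    outdir='/tmp/minitaur_videos',   # gym.wrappers.Monitor writes the rendered videos here
    num_agents=1,
    num_episodes=5,
    checkpoint=None,                 # None loads the most recent checkpoint
    env_processes=False)             # step the environment in-process for easier debugging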