From e4a3b3fe3871e8400e041fa0d016532bcd7c8890 Mon Sep 17 00:00:00 2001
From: Erwin Coumans
Date: Wed, 27 Sep 2017 10:20:38 -0700
Subject: [PATCH] add TensorFlow Agents PPO training script for various
 pybullet environments: example: python -m
 pybullet_envs.agents.train_ppo --config=pybullet_pendulum
 --logdir=pendulum

---
 .../gym/pybullet_envs/agents/config_ppo.py    | 110 ++++++++++++++++++
 .../gym/pybullet_envs/agents/train_ppo.py     |  48 ++++++++
 .../gym/pybullet_envs/agents/visualize_ppo.py |  42 +++++++
 3 files changed, 200 insertions(+)
 create mode 100644 examples/pybullet/gym/pybullet_envs/agents/config_ppo.py
 create mode 100644 examples/pybullet/gym/pybullet_envs/agents/train_ppo.py
 create mode 100644 examples/pybullet/gym/pybullet_envs/agents/visualize_ppo.py

diff --git a/examples/pybullet/gym/pybullet_envs/agents/config_ppo.py b/examples/pybullet/gym/pybullet_envs/agents/config_ppo.py
new file mode 100644
index 000000000..26dcf5af4
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/agents/config_ppo.py
@@ -0,0 +1,110 @@
+"""PPO training configurations for the pybullet Gym environments.
+
+Each public function returns a dict that train_ppo.py picks via --config.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import functools
+from agents import ppo
+from agents.scripts import networks
+from pybullet_envs.bullet import minitaur_gym_env
+from pybullet_envs.bullet import minitaur_env_randomizer
+import pybullet_envs.bullet.minitaur_gym_env as minitaur_gym_env
+import pybullet_envs
+
+
+# pylint: disable=unused-variable
+def default():
+  """Return the settings shared by every configuration in this file.
+
+  Each local variable below becomes one key of the returned dict.
+  """
+  # General
+  algorithm = ppo.PPOAlgorithm
+  num_agents = 25
+  eval_episodes = 25
+  use_gpu = False
+  # Network
+  network = networks.ForwardGaussianPolicy
+  weight_summaries = dict(
+      all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
+  policy_layers = 200, 100
+  value_layers = 200, 100
+  init_mean_factor = 0.2
+  init_logstd = -1
+  network_config = dict()
+  # Optimization
+  update_every = 25
+  policy_optimizer = 'AdamOptimizer'
+  value_optimizer = 'AdamOptimizer'
+  update_epochs_policy = 25
+  update_epochs_value = 25
+  value_lr = 1e-3
+  policy_lr = 1e-4
+  # Losses
+  discount = 0.99
+  kl_target = 1e-2
+  kl_cutoff_factor = 2
+  kl_cutoff_coef = 1000
+  kl_init_penalty = 1
+  return locals()
+
+
+def _with_defaults(**overrides):
+  """Return default() extended with the task-specific `overrides`.
+
+  Replaces `locals().update(default())`, which silently loses the defaults
+  wherever locals() returns a fresh snapshot on each call.
+  """
+  config = default()
+  config.update(overrides)
+  return config
+
+
+def pybullet_pendulum():
+  """Configuration for InvertedPendulumBulletEnv-v0 (50M steps)."""
+  return _with_defaults(
+      env='InvertedPendulumBulletEnv-v0', max_length=200, steps=5e7)
+
+
+def pybullet_doublependulum():
+  """Configuration for InvertedDoublePendulumBulletEnv-v0 (50M steps)."""
+  return _with_defaults(
+      env='InvertedDoublePendulumBulletEnv-v0', max_length=1000, steps=5e7)
+
+
+def pybullet_pendulumswingup():
+  """Configuration for InvertedPendulumSwingupBulletEnv-v0 (50M steps)."""
+  return _with_defaults(
+      env='InvertedPendulumSwingupBulletEnv-v0', max_length=1000, steps=5e7)
+
+
+def pybullet_cheetah():
+  """Configuration for HalfCheetahBulletEnv-v0 (100M steps)."""
+  return _with_defaults(
+      env='HalfCheetahBulletEnv-v0', max_length=1000, steps=1e8)
+
+
+def pybullet_ant():
+  """Configuration for AntBulletEnv-v0 (50M steps)."""
+  return _with_defaults(env='AntBulletEnv-v0', max_length=1000, steps=5e7)
+
+
+def pybullet_racecar():
+  """Configuration for Bullet MIT Racecar task (10M steps)."""
+  # NOTE(review): max_length is 10 here but 200-1000 elsewhere -- confirm.
+  return _with_defaults(env='RacecarBulletEnv-v0', max_length=10, steps=1e7)
+
+
+def pybullet_minitaur():
+  """Configuration built around minitaur_gym_env.MinitaurGymEnv (30M steps)."""
+  randomizer = minitaur_env_randomizer.MinitaurBulletRandomizer()
+  env = functools.partial(
+      minitaur_gym_env.MinitaurGymEnv,
+      accurate_motor_model_enabled=True,
+      motor_overheat_protection=True,
+      pd_control_enabled=True,
+      env_randomizer=randomizer, render=False)
+  return _with_defaults(
+      randomizer=randomizer, env=env, max_length=1000, steps=3e7)
diff --git a/examples/pybullet/gym/pybullet_envs/agents/train_ppo.py
b/examples/pybullet/gym/pybullet_envs/agents/train_ppo.py
new file mode 100644
index 000000000..195df7df8
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/agents/train_ppo.py
@@ -0,0 +1,48 @@
+r"""Script to train a PPO agent on the pybullet environments.
+
+Run:
+  python -m pybullet_envs.agents.train_ppo \
+      --logdir=/tmp/train --config=pybullet_minitaur
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import datetime
+import os
+import tensorflow as tf
+
+from agents import tools
+from agents.scripts import train
+from agents.scripts import utility
+from . import config_ppo
+
+
+flags = tf.app.flags
+FLAGS = tf.app.flags.FLAGS
+
+flags.DEFINE_string('logdir', None, 'Base directory to store logs.')
+flags.DEFINE_string('config', None, 'Configuration to execute.')
+# now() runs once, when this module is loaded, so the default is launch time.
+flags.DEFINE_string(
+    'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
+    'Sub directory to store logs.')
+
+
+def main(_):
+  """Build the configuration named by --config and launch the trainer."""
+  if not FLAGS.config:
+    # getattr() would otherwise fail with an opaque TypeError on None.
+    raise ValueError('You must specify a configuration with --config.')
+  config = tools.AttrDict(getattr(config_ppo, FLAGS.config)())
+  # Runs go to <logdir>/<timestamp>-<config>; an unset --logdir stays falsy.
+  logdir = FLAGS.logdir and os.path.join(
+      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config))
+  utility.save_config(config, logdir)
+  for score in train.train(config, env_processes=True):
+    tf.logging.info(str(score))
+
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/examples/pybullet/gym/pybullet_envs/agents/visualize_ppo.py b/examples/pybullet/gym/pybullet_envs/agents/visualize_ppo.py
new file mode 100644
index 000000000..fa5f7b21c
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/agents/visualize_ppo.py
@@ -0,0 +1,42 @@
+r"""Script to visualize a trained PPO agent.
+
+Run:
+  python -m pybullet_envs.agents.visualize_ppo \
+      --logdir=ppo --outdir=/tmp/video/
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from agents.scripts import visualize
+
+
+flags = tf.app.flags
+FLAGS = tf.app.flags.FLAGS
+flags.DEFINE_string("logdir", None,
+                    "Directory to the checkpoint of a training run.")
+flags.DEFINE_string("outdir", None,
+                    "Local directory for storing the monitoring outdir.")
+flags.DEFINE_string("checkpoint", None,
+                    "Checkpoint name to load; defaults to most recent.")
+flags.DEFINE_integer("num_agents", 1,
+                     "How many environments to step in parallel.")
+flags.DEFINE_integer("num_episodes", 1, "Minimum number of episodes to render.")
+flags.DEFINE_boolean(
+    "env_processes", False,
+    "Step environments in separate processes to circumvent the GIL.")
+
+
+def main(_):
+  """Check the required flags and hand everything to visualize.visualize."""
+  if not FLAGS.logdir or not FLAGS.outdir:
+    raise ValueError("You must specify both --logdir and --outdir.")
+  visualize.visualize(FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents,
+                      FLAGS.num_episodes, FLAGS.checkpoint, FLAGS.env_processes)
+
+
+if __name__ == "__main__":
+  tf.app.run()