Merge pull request #1346 from erwincoumans/master

revert to original agents train/visualize scripts (from github)
Authored by erwincoumans on 2017-09-28 11:36:15 -07:00; committed by GitHub.
3 changed files with 306 additions and 58 deletions

Changed file 1 of 3: PPO configuration for the minitaur environments

@@ -1,8 +1,25 @@
-"""The PPO training configuration file for minitaur environments."""
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Example configurations using the PPO algorithm."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
 import functools
+
 from agents import ppo
 from agents.scripts import networks
 from pybullet_envs.bullet import minitaur_gym_env
@@ -11,33 +28,33 @@ import pybullet_envs.bullet.minitaur_gym_env as minitaur_gym_env
 import pybullet_envs
 
+# pylint: disable=unused-variable
 
 def default():
-  """The default configurations."""
+  """Default configuration for PPO."""
   # General
   algorithm = ppo.PPOAlgorithm
-  num_agents = 25
+  num_agents = 10
   eval_episodes = 25
   use_gpu = False
   # Network
   network = networks.ForwardGaussianPolicy
   weight_summaries = dict(
-      all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
+      all=r'.*',
+      policy=r'.*/policy/.*',
+      value=r'.*/value/.*')
   policy_layers = 200, 100
   value_layers = 200, 100
-  init_mean_factor = 0.2
+  init_mean_factor = 0.05
   init_logstd = -1
-  network_config = dict()
   # Optimization
   update_every = 25
   policy_optimizer = 'AdamOptimizer'
   value_optimizer = 'AdamOptimizer'
-  update_epochs_policy = 25
-  update_epochs_value = 25
-  value_lr = 1e-3
+  update_epochs_policy = 50
+  update_epochs_value = 50
   policy_lr = 1e-4
+  value_lr = 3e-4
   # Losses
-  discount = 0.99
+  discount = 0.985
   kl_target = 1e-2
   kl_cutoff_factor = 2
   kl_cutoff_coef = 1000
@@ -107,4 +124,3 @@ def pybullet_minitaur():
   steps = 3e7  # 30M
   return locals()
 
-

Changed file 2 of 3: PPO training script

@@ -1,8 +1,22 @@
-r"""Script to use Proximal Policy Gradient for the minitaur environments.
-
-Run:
-python train_ppo.py --logdif=/tmp/train --config=minitaur_pybullet
-
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Script to train a batch reinforcement learning algorithm.
+
+Command line:
+
+  python3 -m agents.scripts.train --logdir=/path/to/logdir --config=pendulum
 """
 
 from __future__ import absolute_import
@@ -10,39 +24,142 @@ from __future__ import division
 from __future__ import print_function
 
 import datetime
+import functools
 import os
 
+import gym
 import tensorflow as tf
 
 from agents import tools
-from agents.scripts import train
+from . import configs
 from agents.scripts import utility
-from . import config_ppo
-
-flags = tf.app.flags
-FLAGS = tf.app.flags.FLAGS
-flags.DEFINE_string(
-    'logdir', None,
-    'Base directory to store logs.')
-flags.DEFINE_string(
-    'config', None,
-    'Configuration to execute.')
-flags.DEFINE_string(
-    'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
-    'Sub directory to store logs.')
+
+
+def _create_environment(config):
+  """Constructor for an instance of the environment.
+
+  Args:
+    config: Object providing configurations via attributes.
+
+  Returns:
+    Wrapped OpenAI Gym environment.
+  """
+  if isinstance(config.env, str):
+    env = gym.make(config.env)
+  else:
+    env = config.env()
+  if config.max_length:
+    env = tools.wrappers.LimitDuration(env, config.max_length)
+  env = tools.wrappers.RangeNormalize(env)
+  env = tools.wrappers.ClipAction(env)
+  env = tools.wrappers.ConvertTo32Bit(env)
+  return env
+
+
+def _define_loop(graph, logdir, train_steps, eval_steps):
+  """Create and configure a training loop with training and evaluation phases.
+
+  Args:
+    graph: Object providing graph elements via attributes.
+    logdir: Log directory for storing checkpoints and summaries.
+    train_steps: Number of training steps per epoch.
+    eval_steps: Number of evaluation steps per epoch.
+
+  Returns:
+    Loop object.
+  """
+  loop = tools.Loop(
+      logdir, graph.step, graph.should_log, graph.do_report,
+      graph.force_reset)
+  loop.add_phase(
+      'train', graph.done, graph.score, graph.summary, train_steps,
+      report_every=None,
+      log_every=train_steps // 2,
+      checkpoint_every=None,
+      feed={graph.is_training: True})
+  loop.add_phase(
+      'eval', graph.done, graph.score, graph.summary, eval_steps,
+      report_every=eval_steps,
+      log_every=eval_steps // 2,
+      checkpoint_every=10 * eval_steps,
+      feed={graph.is_training: False})
+  return loop
+
+
+def train(config, env_processes):
+  """Training and evaluation entry point yielding scores.
+
+  Resolves some configuration attributes, creates environments, graph, and
+  training loop. By default, assigns all operations to the CPU.
+
+  Args:
+    config: Object providing configurations via attributes.
+    env_processes: Whether to step environments in separate processes.
+
+  Yields:
+    Evaluation scores.
+  """
+  tf.reset_default_graph()
+  with config.unlocked:
+    config.network = functools.partial(
+        utility.define_network, config.network, config)
+    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
+    config.value_optimizer = getattr(tf.train, config.value_optimizer)
+  if config.update_every % config.num_agents:
+    tf.logging.warn('Number of agents should divide episodes per update.')
+  with tf.device('/cpu:0'):
+    batch_env = utility.define_batch_env(
+        lambda: _create_environment(config),
+        config.num_agents, env_processes)
+    graph = utility.define_simulation_graph(
+        batch_env, config.algorithm, config)
+    loop = _define_loop(
+        graph, config.logdir,
+        config.update_every * config.max_length,
+        config.eval_episodes * config.max_length)
+    total_steps = int(
+        config.steps / config.update_every *
+        (config.update_every + config.eval_episodes))
+  # Exclude episode related variables since the Python state of environments is
+  # not checkpointed and thus new episodes start after resuming.
+  saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
+  sess_config = tf.ConfigProto(allow_soft_placement=True)
+  sess_config.gpu_options.allow_growth = True
+  with tf.Session(config=sess_config) as sess:
+    utility.initialize_variables(sess, saver, config.logdir)
+    for score in loop.run(sess, saver, total_steps):
+      yield score
+  batch_env.close()
 
 
 def main(_):
   """Create or load configuration and launch the trainer."""
-  config = tools.AttrDict(getattr(config_ppo, FLAGS.config)())
-  logdir = FLAGS.logdir and os.path.join(
-      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config))
-  utility.save_config(config, logdir)
-  for score in train.train(config, env_processes=True):
-    tf.logging.info(str(score))
+  utility.set_up_logging()
+  if not FLAGS.config:
+    raise KeyError('You must specify a configuration.')
+  logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
+      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
+  try:
+    config = utility.load_config(logdir)
+  except IOError:
+    config = tools.AttrDict(getattr(configs, FLAGS.config)())
+    config = utility.save_config(config, logdir)
+  for score in train(config, FLAGS.env_processes):
+    tf.logging.info('Score {}.'.format(score))
 
 
 if __name__ == '__main__':
+  FLAGS = tf.app.flags.FLAGS
+  tf.app.flags.DEFINE_string(
+      'logdir', None,
+      'Base directory to store logs.')
+  tf.app.flags.DEFINE_string(
+      'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
+      'Sub directory to store logs.')
+  tf.app.flags.DEFINE_string(
+      'config', None,
+      'Configuration to execute.')
+  tf.app.flags.DEFINE_boolean(
+      'env_processes', True,
+      'Step environments in separate processes to circumvent the GIL.')
   tf.app.run()
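The new train() derives its overall simulator budget from the configuration: the training phase of each loop epoch runs for update_every * max_length steps, the evaluation phase for eval_episodes * max_length steps, and total_steps scales config.steps up so that the evaluation episodes come on top of the requested training steps. The small worked example below is just that arithmetic, not the agents API; it assumes the pybullet_minitaur value steps = 3e7 from the first file together with the default update_every and eval_episodes of 25 shown above.

# Worked example of the step-budget arithmetic in train() (assumed config values).
steps = 3e7          # pybullet_minitaur: 30M training steps
update_every = 25    # episodes collected per policy update (default above)
eval_episodes = 25   # episodes per evaluation phase (default above)

total_steps = int(steps / update_every * (update_every + eval_episodes))
print(total_steps)   # 60000000: with eval_episodes == update_every the budget doubles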

Changed file 3 of 3: PPO visualization script

@@ -1,42 +1,157 @@
-r"""Script to visualize the trained PPO agent.
-
-python -m pybullet_envs.agents.visualize \
-    --logdir=ppo
-    --outdir=/tmp/video/
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Script to render videos of the Proximal Policy Gradient algorithm.
+
+Command line:
+
+  python3 -m agents.scripts.visualize \
+      --logdir=/path/to/logdir/<time>-<config> --outdir=/path/to/outdir/
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import os
+
+import gym
 import tensorflow as tf
-from agents.scripts import visualize
+
+from agents import tools
+from agents.scripts import utility
 
-flags = tf.app.flags
-FLAGS = tf.app.flags.FLAGS
-flags.DEFINE_string("logdir", None,
-                    "Directory to the checkpoint of a training run.")
-flags.DEFINE_string("outdir", None,
-                    "Local directory for storing the monitoring outdir.")
-flags.DEFINE_string("checkpoint", None,
-                    "Checkpoint name to load; defaults to most recent.")
-flags.DEFINE_integer("num_agents", 1,
-                     "How many environments to step in parallel.")
-flags.DEFINE_integer("num_episodes", 1, "Minimum number of episodes to render.")
-flags.DEFINE_boolean(
-    "env_processes", False,
-    "Step environments in separate processes to circumvent the GIL.")
+
+def _create_environment(config, outdir):
+  """Constructor for an instance of the environment.
+
+  Args:
+    config: Object providing configurations via attributes.
+    outdir: Directory to store videos in.
+
+  Returns:
+    Wrapped OpenAI Gym environment.
+  """
+  if isinstance(config.env, str):
+    env = gym.make(config.env)
+  else:
+    env = config.env()
+  # Ensure that the environment has the specification attribute set as expected
+  # by the monitor wrapper.
+  if not hasattr(env, 'spec'):
+    setattr(env, 'spec', getattr(env, 'spec', None))
+  if config.max_length:
+    env = tools.wrappers.LimitDuration(env, config.max_length)
+  env = gym.wrappers.Monitor(
+      env, outdir, lambda unused_episode_number: True)
+  env = tools.wrappers.RangeNormalize(env)
+  env = tools.wrappers.ClipAction(env)
+  env = tools.wrappers.ConvertTo32Bit(env)
+  return env
+
+
+def _define_loop(graph, eval_steps):
+  """Create and configure an evaluation loop.
+
+  Args:
+    graph: Object providing graph elements via attributes.
+    eval_steps: Number of evaluation steps per epoch.
+
+  Returns:
+    Loop object.
+  """
+  loop = tools.Loop(
+      None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
+  loop.add_phase(
+      'eval', graph.done, graph.score, graph.summary, eval_steps,
+      report_every=eval_steps,
+      log_every=None,
+      checkpoint_every=None,
+      feed={graph.is_training: False})
+  return loop
+
+
+def visualize(
+    logdir, outdir, num_agents, num_episodes, checkpoint=None,
+    env_processes=True):
+  """Recover checkpoint and render videos from it.
+
+  Args:
+    logdir: Logging directory of the trained algorithm.
+    outdir: Directory to store rendered videos in.
+    num_agents: Number of environments to simulate in parallel.
+    num_episodes: Total number of episodes to simulate.
+    checkpoint: Checkpoint name to load; defaults to most recent.
+    env_processes: Whether to step environments in separate processes.
+  """
+  config = utility.load_config(logdir)
+  with config.unlocked:
+    config.network = functools.partial(
+        utility.define_network, config.network, config)
+    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
+    config.value_optimizer = getattr(tf.train, config.value_optimizer)
+  with tf.device('/cpu:0'):
+    batch_env = utility.define_batch_env(
+        lambda: _create_environment(config, outdir),
+        num_agents, env_processes)
+    graph = utility.define_simulation_graph(
+        batch_env, config.algorithm, config)
+    total_steps = num_episodes * config.max_length
+    loop = _define_loop(graph, total_steps)
+  saver = utility.define_saver(
+      exclude=(r'.*_temporary/.*', r'global_step'))
+  sess_config = tf.ConfigProto(allow_soft_placement=True)
+  sess_config.gpu_options.allow_growth = True
+  with tf.Session(config=sess_config) as sess:
+    utility.initialize_variables(
+        sess, saver, config.logdir, checkpoint, resume=True)
+    for unused_score in loop.run(sess, saver, total_steps):
+      pass
+  batch_env.close()
 
 
 def main(_):
-  visualize.visualize(FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents,
-                      FLAGS.num_episodes, FLAGS.checkpoint, FLAGS.env_processes)
+  """Load a trained algorithm and render videos."""
+  utility.set_up_logging()
+  if not FLAGS.logdir or not FLAGS.outdir:
+    raise KeyError('You must specify logging and outdirs directories.')
+  FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
+  FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
+  visualize(
+      FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
+      FLAGS.checkpoint, FLAGS.env_processes)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
+  FLAGS = tf.app.flags.FLAGS
+  tf.app.flags.DEFINE_string(
+      'logdir', None,
+      'Directory to the checkpoint of a training run.')
+  tf.app.flags.DEFINE_string(
+      'outdir', None,
+      'Local directory for storing the monitoring outdir.')
+  tf.app.flags.DEFINE_string(
+      'checkpoint', None,
+      'Checkpoint name to load; defaults to most recent.')
+  tf.app.flags.DEFINE_integer(
+      'num_agents', 1,
+      'How many environments to step in parallel.')
+  tf.app.flags.DEFINE_integer(
+      'num_episodes', 5,
+      'Minimum number of episodes to render.')
+  tf.app.flags.DEFINE_boolean(
+      'env_processes', True,
+      'Step environments in separate processes to circumvent the GIL.')
   tf.app.run()
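Because the flags now live under the __main__ guard, the visualize() function defined in this file can also be driven directly from Python rather than through tf.app.run(). The sketch below is a hypothetical usage example: the import path and both directory paths are placeholders (the diff does not show the file name); only the function signature and the <timestamp>-<config> directory layout come from the code above.

# Hypothetical usage sketch: module name and paths are placeholders, not from the diff.
from pybullet_envs.agents import visualize_ppo  # assumed import path for the file above

visualize_ppo.visualize(
    logdir='/path/to/logdir/20170928T113615-pybullet_minitaur',  # <timestamp>-<config> dir written by the train script
    outdir='/tmp/minitaur_videos',   # gym.wrappers.Monitor writes the rendered videos here
    num_agents=1,
    num_episodes=5,
    checkpoint=None,                 # None loads the most recent checkpoint
    env_processes=False)             # step the environment in-process for easier debugging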