Merge pull request #1346 from erwincoumans/master
revert to original agents train/visualize scripts (from github)
@@ -1,8 +1,25 @@
-"""The PPO training configuration file for minitaur environments."""
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
+"""Example configurations using the PPO algorithm."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import functools
 
 from agents import ppo
 from agents.scripts import networks
 from pybullet_envs.bullet import minitaur_gym_env
@@ -11,33 +28,33 @@ import pybullet_envs.bullet.minitaur_gym_env as minitaur_gym_env
 import pybullet_envs
 
 
-# pylint: disable=unused-variable
 def default():
-  """The default configurations."""
+  """Default configuration for PPO."""
   # General
   algorithm = ppo.PPOAlgorithm
-  num_agents = 25
+  num_agents = 10
   eval_episodes = 25
   use_gpu = False
   # Network
   network = networks.ForwardGaussianPolicy
   weight_summaries = dict(
-      all=r'.*', policy=r'.*/policy/.*', value=r'.*/value/.*')
+      all=r'.*',
+      policy=r'.*/policy/.*',
+      value=r'.*/value/.*')
   policy_layers = 200, 100
   value_layers = 200, 100
-  init_mean_factor = 0.2
+  init_mean_factor = 0.05
   init_logstd = -1
-  network_config = dict()
   # Optimization
   update_every = 25
   policy_optimizer = 'AdamOptimizer'
   value_optimizer = 'AdamOptimizer'
-  update_epochs_policy = 25
-  update_epochs_value = 25
-  value_lr = 1e-3
+  update_epochs_policy = 50
+  update_epochs_value = 50
   policy_lr = 1e-4
+  value_lr = 3e-4
   # Losses
-  discount = 0.99
+  discount = 0.985
   kl_target = 1e-2
   kl_cutoff_factor = 2
   kl_cutoff_coef = 1000
@@ -107,4 +124,3 @@ def pybullet_minitaur():
   steps = 3e7  # 30M
   return locals()
 
-
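For context, the named configurations in this file follow the agents convention: they build on a shared default() and return locals(), and the training script wraps the result in tools.AttrDict. A minimal sketch of that pattern, as it might be appended to the configuration module above; the my_minitaur name, the environment constructor arguments, and the episode length are illustrative and not part of this commit:

import functools

from agents import tools
from pybullet_envs.bullet import minitaur_gym_env


def my_minitaur():
  """Illustrative config: start from default() and override a few attributes."""
  locals().update(default())  # Pull in the shared defaults defined above.
  # The env attribute may be a gym id string or a callable returning an env.
  env = functools.partial(minitaur_gym_env.MinitaurBulletEnv, render=False)
  max_length = 1000
  steps = 3e7  # 30M
  return locals()


# The training script resolves a named configuration like this:
config = tools.AttrDict(my_minitaur())
assert config.num_agents == 10 and config.policy_lr == 1e-4

Attributes assigned after the update shadow the defaults, which is how a configuration such as pybullet_minitaur() above can set its own step count.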
@@ -1,8 +1,22 @@
-r"""Script to use Proximal Policy Gradient for the minitaur environments.
-
-Run:
-python train_ppo.py --logdif=/tmp/train --config=minitaur_pybullet
-
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Script to train a batch reinforcement learning algorithm.
+
+Command line:
+
+  python3 -m agents.scripts.train --logdir=/path/to/logdir --config=pendulum
 """
 
 from __future__ import absolute_import
@@ -10,39 +24,142 @@ from __future__ import division
 from __future__ import print_function
 
 import datetime
+import functools
 import os
 
+import gym
 import tensorflow as tf
 
 from agents import tools
-from agents.scripts import train
+from . import configs
 from agents.scripts import utility
-from . import config_ppo
 
 
-flags = tf.app.flags
-FLAGS = tf.app.flags.FLAGS
-
-flags.DEFINE_string(
-    'logdir', None,
-    'Base directory to store logs.')
-flags.DEFINE_string(
-    'config', None,
-    'Configuration to execute.')
-flags.DEFINE_string(
-    'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
-    'Sub directory to store logs.')
+def _create_environment(config):
+  """Constructor for an instance of the environment.
+
+  Args:
+    config: Object providing configurations via attributes.
+
+  Returns:
+    Wrapped OpenAI Gym environment.
+  """
+  if isinstance(config.env, str):
+    env = gym.make(config.env)
+  else:
+    env = config.env()
+  if config.max_length:
+    env = tools.wrappers.LimitDuration(env, config.max_length)
+  env = tools.wrappers.RangeNormalize(env)
+  env = tools.wrappers.ClipAction(env)
+  env = tools.wrappers.ConvertTo32Bit(env)
+  return env
+
+
+def _define_loop(graph, logdir, train_steps, eval_steps):
+  """Create and configure a training loop with training and evaluation phases.
+
+  Args:
+    graph: Object providing graph elements via attributes.
+    logdir: Log directory for storing checkpoints and summaries.
+    train_steps: Number of training steps per epoch.
+    eval_steps: Number of evaluation steps per epoch.
+
+  Returns:
+    Loop object.
+  """
+  loop = tools.Loop(
+      logdir, graph.step, graph.should_log, graph.do_report,
+      graph.force_reset)
+  loop.add_phase(
+      'train', graph.done, graph.score, graph.summary, train_steps,
+      report_every=None,
+      log_every=train_steps // 2,
+      checkpoint_every=None,
+      feed={graph.is_training: True})
+  loop.add_phase(
+      'eval', graph.done, graph.score, graph.summary, eval_steps,
+      report_every=eval_steps,
+      log_every=eval_steps // 2,
+      checkpoint_every=10 * eval_steps,
+      feed={graph.is_training: False})
+  return loop
+
+
+def train(config, env_processes):
+  """Training and evaluation entry point yielding scores.
+
+  Resolves some configuration attributes, creates environments, graph, and
+  training loop. By default, assigns all operations to the CPU.
+
+  Args:
+    config: Object providing configurations via attributes.
+    env_processes: Whether to step environments in separate processes.
+
+  Yields:
+    Evaluation scores.
+  """
+  tf.reset_default_graph()
+  with config.unlocked:
+    config.network = functools.partial(
+        utility.define_network, config.network, config)
+    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
+    config.value_optimizer = getattr(tf.train, config.value_optimizer)
+  if config.update_every % config.num_agents:
+    tf.logging.warn('Number of agents should divide episodes per update.')
+  with tf.device('/cpu:0'):
+    batch_env = utility.define_batch_env(
+        lambda: _create_environment(config),
+        config.num_agents, env_processes)
+    graph = utility.define_simulation_graph(
+        batch_env, config.algorithm, config)
+    loop = _define_loop(
+        graph, config.logdir,
+        config.update_every * config.max_length,
+        config.eval_episodes * config.max_length)
+    total_steps = int(
+        config.steps / config.update_every *
+        (config.update_every + config.eval_episodes))
+  # Exclude episode related variables since the Python state of environments is
+  # not checkpointed and thus new episodes start after resuming.
+  saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
+  sess_config = tf.ConfigProto(allow_soft_placement=True)
+  sess_config.gpu_options.allow_growth = True
+  with tf.Session(config=sess_config) as sess:
+    utility.initialize_variables(sess, saver, config.logdir)
+    for score in loop.run(sess, saver, total_steps):
+      yield score
+  batch_env.close()
 
 
 def main(_):
   """Create or load configuration and launch the trainer."""
-  config = tools.AttrDict(getattr(config_ppo, FLAGS.config)())
-  logdir = FLAGS.logdir and os.path.join(
-      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config))
-  utility.save_config(config, logdir)
-  for score in train.train(config, env_processes=True):
-    tf.logging.info(str(score))
+  utility.set_up_logging()
+  if not FLAGS.config:
+    raise KeyError('You must specify a configuration.')
+  logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
+      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
+  try:
+    config = utility.load_config(logdir)
+  except IOError:
+    config = tools.AttrDict(getattr(configs, FLAGS.config)())
+    config = utility.save_config(config, logdir)
+  for score in train(config, FLAGS.env_processes):
+    tf.logging.info('Score {}.'.format(score))
 
 
 if __name__ == '__main__':
+  FLAGS = tf.app.flags.FLAGS
+  tf.app.flags.DEFINE_string(
+      'logdir', None,
+      'Base directory to store logs.')
+  tf.app.flags.DEFINE_string(
+      'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
+      'Sub directory to store logs.')
+  tf.app.flags.DEFINE_string(
+      'config', None,
+      'Configuration to execute.')
+  tf.app.flags.DEFINE_boolean(
+      'env_processes', True,
+      'Step environments in separate processes to circumvent the GIL.')
   tf.app.run()
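The reverted trainer keeps the agents entry points: the flags are defined under __main__, main() loads or creates the configuration, and train() is a generator that yields evaluation scores as training progresses. A minimal driver mirroring main(), for running the same logic from other Python code; the log directory and configuration name are illustrative, and the snippet assumes it runs inside this module so that configs and train resolve:

import os

import tensorflow as tf

from agents import tools
from agents.scripts import utility

logdir = os.path.expanduser('~/logdir/20170901T120000-pybullet_minitaur')  # illustrative
try:
  # Resume: reuse the configuration saved by a previous run in this logdir.
  config = utility.load_config(logdir)
except IOError:
  # Fresh run: build the named configuration and save it alongside checkpoints.
  config = tools.AttrDict(getattr(configs, 'pybullet_minitaur')())
  config = utility.save_config(config, logdir)
for score in train(config, env_processes=True):
  tf.logging.info('Score {}.'.format(score))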
@@ -1,42 +1,157 @@
-r"""Script to visualize the trained PPO agent.
-
-python -m pybullet_envs.agents.visualize \
-    --logdir=ppo
-    --outdir=/tmp/video/
-
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Script to render videos of the Proximal Policy Gradient algorithm.
+
+Command line:
+
+  python3 -m agents.scripts.visualize \
+      --logdir=/path/to/logdir/<time>-<config> --outdir=/path/to/outdir/
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import os
+
+import gym
 import tensorflow as tf
 
-from agents.scripts import visualize
+from agents import tools
+from agents.scripts import utility
 
 
-flags = tf.app.flags
-FLAGS = tf.app.flags.FLAGS
-flags.DEFINE_string("logdir", None,
-                    "Directory to the checkpoint of a training run.")
-flags.DEFINE_string("outdir", None,
-                    "Local directory for storing the monitoring outdir.")
-flags.DEFINE_string("checkpoint", None,
-                    "Checkpoint name to load; defaults to most recent.")
-flags.DEFINE_integer("num_agents", 1,
-                     "How many environments to step in parallel.")
-flags.DEFINE_integer("num_episodes", 1, "Minimum number of episodes to render.")
-flags.DEFINE_boolean(
-    "env_processes", False,
-    "Step environments in separate processes to circumvent the GIL.")
+def _create_environment(config, outdir):
+  """Constructor for an instance of the environment.
+
+  Args:
+    config: Object providing configurations via attributes.
+    outdir: Directory to store videos in.
+
+  Returns:
+    Wrapped OpenAI Gym environment.
+  """
+  if isinstance(config.env, str):
+    env = gym.make(config.env)
+  else:
+    env = config.env()
+  # Ensure that the environment has the specification attribute set as expected
+  # by the monitor wrapper.
+  if not hasattr(env, 'spec'):
+    setattr(env, 'spec', getattr(env, 'spec', None))
+  if config.max_length:
+    env = tools.wrappers.LimitDuration(env, config.max_length)
+  env = gym.wrappers.Monitor(
+      env, outdir, lambda unused_episode_number: True)
+  env = tools.wrappers.RangeNormalize(env)
+  env = tools.wrappers.ClipAction(env)
+  env = tools.wrappers.ConvertTo32Bit(env)
+  return env
+
+
+def _define_loop(graph, eval_steps):
+  """Create and configure an evaluation loop.
+
+  Args:
+    graph: Object providing graph elements via attributes.
+    eval_steps: Number of evaluation steps per epoch.
+
+  Returns:
+    Loop object.
+  """
+  loop = tools.Loop(
+      None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
+  loop.add_phase(
+      'eval', graph.done, graph.score, graph.summary, eval_steps,
+      report_every=eval_steps,
+      log_every=None,
+      checkpoint_every=None,
+      feed={graph.is_training: False})
+  return loop
+
+
+def visualize(
+    logdir, outdir, num_agents, num_episodes, checkpoint=None,
+    env_processes=True):
+  """Recover checkpoint and render videos from it.
+
+  Args:
+    logdir: Logging directory of the trained algorithm.
+    outdir: Directory to store rendered videos in.
+    num_agents: Number of environments to simulate in parallel.
+    num_episodes: Total number of episodes to simulate.
+    checkpoint: Checkpoint name to load; defaults to most recent.
+    env_processes: Whether to step environments in separate processes.
+  """
+  config = utility.load_config(logdir)
+  with config.unlocked:
+    config.network = functools.partial(
+        utility.define_network, config.network, config)
+    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
+    config.value_optimizer = getattr(tf.train, config.value_optimizer)
+  with tf.device('/cpu:0'):
+    batch_env = utility.define_batch_env(
+        lambda: _create_environment(config, outdir),
+        num_agents, env_processes)
+    graph = utility.define_simulation_graph(
+        batch_env, config.algorithm, config)
+    total_steps = num_episodes * config.max_length
+    loop = _define_loop(graph, total_steps)
+  saver = utility.define_saver(
+      exclude=(r'.*_temporary/.*', r'global_step'))
+  sess_config = tf.ConfigProto(allow_soft_placement=True)
+  sess_config.gpu_options.allow_growth = True
+  with tf.Session(config=sess_config) as sess:
+    utility.initialize_variables(
+        sess, saver, config.logdir, checkpoint, resume=True)
+    for unused_score in loop.run(sess, saver, total_steps):
+      pass
+  batch_env.close()
 
 
 def main(_):
-  visualize.visualize(FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents,
-                      FLAGS.num_episodes, FLAGS.checkpoint, FLAGS.env_processes)
+  """Load a trained algorithm and render videos."""
+  utility.set_up_logging()
+  if not FLAGS.logdir or not FLAGS.outdir:
+    raise KeyError('You must specify logging and outdirs directories.')
+  FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
+  FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
+  visualize(
+      FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
+      FLAGS.checkpoint, FLAGS.env_processes)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
+  FLAGS = tf.app.flags.FLAGS
+  tf.app.flags.DEFINE_string(
+      'logdir', None,
+      'Directory to the checkpoint of a training run.')
+  tf.app.flags.DEFINE_string(
+      'outdir', None,
+      'Local directory for storing the monitoring outdir.')
+  tf.app.flags.DEFINE_string(
+      'checkpoint', None,
+      'Checkpoint name to load; defaults to most recent.')
+  tf.app.flags.DEFINE_integer(
+      'num_agents', 1,
+      'How many environments to step in parallel.')
+  tf.app.flags.DEFINE_integer(
+      'num_episodes', 5,
+      'Minimum number of episodes to render.')
+  tf.app.flags.DEFINE_boolean(
+      'env_processes', True,
+      'Step environments in separate processes to circumvent the GIL.')
   tf.app.run()
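Once a training run exists, the visualize() entry point added above can also be called directly: it reloads the saved configuration from logdir, wraps the environment in gym.wrappers.Monitor so episodes are written to outdir, and replays the most recent (or an explicitly named) checkpoint. A minimal sketch with illustrative paths, assuming it runs inside this module:

import os

logdir = os.path.expanduser('~/logdir/20170901T120000-pybullet_minitaur')  # illustrative
outdir = os.path.expanduser('~/videos/minitaur')  # illustrative

# One environment, five recorded episodes, most recent checkpoint.
visualize(logdir, outdir, num_agents=1, num_episodes=5,
          checkpoint=None, env_processes=False)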