Make sure that the pre-trained galloping and trotting policies work for the minitaur_reactive_env and minitaur_trotting_env environments.

Jie
2018-04-24 21:48:27 -07:00
parent 982453abc6
commit a375a349ce
43 changed files with 4447 additions and 42 deletions


@@ -0,0 +1,23 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Executable scripts for reinforcement learning."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import train
from . import utility
from . import visualize


@@ -0,0 +1,128 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example configurations using the PPO algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-variable
from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents.scripts import networks
def default():
"""Default configuration for PPO."""
# General
algorithm = ppo.PPOAlgorithm
num_agents = 10
eval_episodes = 25
use_gpu = False
# Network
network = networks.ForwardGaussianPolicy
weight_summaries = dict(
all=r'.*',
policy=r'.*/policy/.*',
value=r'.*/value/.*')
policy_layers = 200, 100
value_layers = 200, 100
init_mean_factor = 0.05
init_logstd = -1
# Optimization
update_every = 25
policy_optimizer = 'AdamOptimizer'
value_optimizer = 'AdamOptimizer'
update_epochs_policy = 50
update_epochs_value = 50
policy_lr = 1e-4
value_lr = 3e-4
# Losses
discount = 0.985
kl_target = 1e-2
kl_cutoff_factor = 2
kl_cutoff_coef = 1000
kl_init_penalty = 1
return locals()
def pendulum():
"""Configuration for the pendulum classic control task."""
locals().update(default())
# Environment
env = 'Pendulum-v0'
max_length = 200
steps = 1e6 # 1M
return locals()
def cheetah():
"""Configuration for MuJoCo's half cheetah task."""
locals().update(default())
# Environment
env = 'HalfCheetah-v1'
max_length = 1000
steps = 1e7 # 10M
return locals()
def walker():
"""Configuration for MuJoCo's walker task."""
locals().update(default())
# Environment
env = 'Walker2d-v1'
max_length = 1000
steps = 1e7 # 10M
return locals()
def reacher():
"""Configuration for MuJoCo's reacher task."""
locals().update(default())
# Environment
env = 'Reacher-v1'
max_length = 1000
steps = 1e7 # 10M
return locals()
def hopper():
"""Configuration for MuJoCo's hopper task."""
locals().update(default())
# Environment
env = 'Hopper-v1'
max_length = 1000
steps = 2e7 # 20M
return locals()
def ant():
"""Configuration for MuJoCo's ant task."""
locals().update(default())
# Environment
env = 'Ant-v1'
max_length = 1000
steps = 5e7 # 50M
return locals()
def humanoid():
"""Configuration for MuJoCo's humanoid task."""
locals().update(default())
# Environment
env = 'Humanoid-v1'
max_length = 1000
steps = 5e7 # 50M
return locals()
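Every task configuration above follows the same pattern: pull in default() with locals().update(), override the environment and step budget, and return locals(). As an illustrative sketch only (not from this commit; the Gym id and step budget are assumptions), a PyBullet-based task could be configured the same way:

def minitaur_bullet():
  """Illustrative configuration for a PyBullet minitaur task (assumed Gym id)."""
  locals().update(default())
  # Environment
  env = 'MinitaurBulletEnv-v0'  # assumed registered id; a callable constructor also works
  max_length = 1000
  steps = 3e7  # 30M
  return locals()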


@@ -0,0 +1,167 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Networks for the PPO algorithm defined as recurrent cells."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
_MEAN_WEIGHTS_INITIALIZER = tf.contrib.layers.variance_scaling_initializer(
factor=0.1)
_LOGSTD_INITIALIZER = tf.random_normal_initializer(-1, 1e-10)
class LinearGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Indepent linear network with a tanh at the end for policy and feedforward network for the value.
The policy network outputs the mean action and the log standard deviation
is learned as indepent parameter vector.
"""
def __init__(self,
policy_layers,
value_layers,
action_size,
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
self._mean_weights_initializer = mean_weights_initializer
self._logstd_initializer = logstd_initializer
@property
def state_size(self):
unused_state_size = 1
return unused_state_size
@property
def output_size(self):
return (self._action_size, self._action_size, tf.TensorShape([]))
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
mean = tf.contrib.layers.fully_connected(
x,
self._action_size,
tf.tanh,
weights_initializer=self._mean_weights_initializer)
logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32,
self._logstd_initializer)
logstd = tf.tile(logstd[None, ...],
[tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
return (mean, logstd, value), state
class ForwardGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Independent feed forward networks for policy and value.
The policy network outputs the mean action and the log standard deviation
is learned as independent parameter vector.
"""
def __init__(
self, policy_layers, value_layers, action_size,
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
self._mean_weights_initializer = mean_weights_initializer
self._logstd_initializer = logstd_initializer
@property
def state_size(self):
unused_state_size = 1
return unused_state_size
@property
def output_size(self):
return (self._action_size, self._action_size, tf.TensorShape([]))
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
for size in self._policy_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
mean = tf.contrib.layers.fully_connected(
x, self._action_size, tf.tanh,
weights_initializer=self._mean_weights_initializer)
logstd = tf.get_variable(
'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
logstd = tf.tile(
logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
return (mean, logstd, value), state
class RecurrentGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Independent recurrent policy and feed forward value networks.
The policy network outputs the mean action and the log standard deviation
is learned as independent parameter vector. The last policy layer is recurrent
and uses a GRU cell.
"""
def __init__(
self, policy_layers, value_layers, action_size,
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
self._mean_weights_initializer = mean_weights_initializer
self._logstd_initializer = logstd_initializer
self._cell = tf.contrib.rnn.GRUBlockCell(100)
@property
def state_size(self):
return self._cell.state_size
@property
def output_size(self):
return (self._action_size, self._action_size, tf.TensorShape([]))
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
for size in self._policy_layers[:-1]:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
x, state = self._cell(x, state)
mean = tf.contrib.layers.fully_connected(
x, self._action_size, tf.tanh,
weights_initializer=self._mean_weights_initializer)
logstd = tf.get_variable(
'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
logstd = tf.tile(
logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
return (mean, logstd, value), state
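All three cells share the same interface: called on a batch of observations and a state (a dummy for the feed-forward cells, the GRU state for the recurrent one), they return a (mean, logstd, value) tuple and the new state. A minimal sketch of that contract, assuming TensorFlow 1.x with tf.contrib and an 8-dimensional action space chosen only for illustration:

# Sketch only: exercise the (mean, logstd, value) contract on placeholder data.
import tensorflow as tf
from pybullet_envs.minitaur.agents.scripts import networks

observations = tf.placeholder(tf.float32, [None, 28])  # flat observation batch
cell = networks.ForwardGaussianPolicy(
    policy_layers=(200, 100), value_layers=(200, 100), action_size=8)
state = tf.zeros([tf.shape(observations)[0], 1])  # state_size is 1 and unused here
(mean, logstd, value), state = cell(observations, state)
# mean and logstd have shape [batch, 8]; value has shape [batch].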


@@ -0,0 +1,165 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to train a batch reinforcement learning algorithm.
Command line:
python3 -m agents.scripts.train --logdir=/path/to/logdir --config=pendulum
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import functools
import os
import gym
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import utility
def _create_environment(config):
"""Constructor for an instance of the environment.
Args:
config: Object providing configurations via attributes.
Returns:
Wrapped OpenAI Gym environment.
"""
if isinstance(config.env, str):
env = gym.make(config.env)
else:
env = config.env()
if config.max_length:
env = tools.wrappers.LimitDuration(env, config.max_length)
env = tools.wrappers.RangeNormalize(env)
env = tools.wrappers.ClipAction(env)
env = tools.wrappers.ConvertTo32Bit(env)
return env
def _define_loop(graph, logdir, train_steps, eval_steps):
"""Create and configure a training loop with training and evaluation phases.
Args:
graph: Object providing graph elements via attributes.
logdir: Log directory for storing checkpoints and summaries.
train_steps: Number of training steps per epoch.
eval_steps: Number of evaluation steps per epoch.
Returns:
Loop object.
"""
loop = tools.Loop(
logdir, graph.step, graph.should_log, graph.do_report,
graph.force_reset)
loop.add_phase(
'train', graph.done, graph.score, graph.summary, train_steps,
report_every=None,
log_every=train_steps // 2,
checkpoint_every=None,
feed={graph.is_training: True})
loop.add_phase(
'eval', graph.done, graph.score, graph.summary, eval_steps,
report_every=eval_steps,
log_every=eval_steps // 2,
checkpoint_every=10 * eval_steps,
feed={graph.is_training: False})
return loop
def train(config, env_processes):
"""Training and evaluation entry point yielding scores.
Resolves some configuration attributes, creates environments, graph, and
training loop. By default, assigns all operations to the CPU.
Args:
config: Object providing configurations via attributes.
env_processes: Whether to step environments in separate processes.
Yields:
Evaluation scores.
"""
tf.reset_default_graph()
with config.unlocked:
config.network = functools.partial(
utility.define_network, config.network, config)
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
config.value_optimizer = getattr(tf.train, config.value_optimizer)
if config.update_every % config.num_agents:
tf.logging.warn('Number of agents should divide episodes per update.')
with tf.device('/cpu:0'):
batch_env = utility.define_batch_env(
lambda: _create_environment(config),
config.num_agents, env_processes)
graph = utility.define_simulation_graph(
batch_env, config.algorithm, config)
loop = _define_loop(
graph, config.logdir,
config.update_every * config.max_length,
config.eval_episodes * config.max_length)
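# Each cycle runs update_every training episodes followed by eval_episodes
# evaluation episodes, so the loop length scales config.steps (which counts
# training steps only) by (update_every + eval_episodes) / update_every.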
total_steps = int(
config.steps / config.update_every *
(config.update_every + config.eval_episodes))
# Exclude episode related variables since the Python state of environments is
# not checkpointed and thus new episodes start after resuming.
saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
with tf.Session(config=sess_config) as sess:
utility.initialize_variables(sess, saver, config.logdir)
for score in loop.run(sess, saver, total_steps):
yield score
batch_env.close()
def main(_):
"""Create or load configuration and launch the trainer."""
utility.set_up_logging()
if not FLAGS.config:
raise KeyError('You must specify a configuration.')
logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
try:
config = utility.load_config(logdir)
except IOError:
config = tools.AttrDict(getattr(configs, FLAGS.config)())
config = utility.save_config(config, logdir)
for score in train(config, FLAGS.env_processes):
tf.logging.info('Score {}.'.format(score))
if __name__ == '__main__':
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
'logdir', None,
'Base directory to store logs.')
tf.app.flags.DEFINE_string(
'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
'Sub directory to store logs.')
tf.app.flags.DEFINE_string(
'config', None,
'Configuration to execute.')
tf.app.flags.DEFINE_boolean(
'env_processes', True,
'Step environments in separate processes to circumvent the GIL.')
tf.app.run()
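The same flow can be driven without the flag machinery. A hedged sketch that mirrors main() above, using the pendulum configuration and an assumed log directory:

# Sketch only: build a configuration and consume the score generator directly.
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import train
from pybullet_envs.minitaur.agents.scripts import utility

utility.set_up_logging()
config = tools.AttrDict(configs.pendulum())
config = utility.save_config(config, '/tmp/pendulum-run')  # assumed log directory
for score in train.train(config, env_processes=False):
  tf.logging.info('Score {}.'.format(score))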


@@ -0,0 +1,110 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the PPO algorithm usage example."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import itertools
import tensorflow as tf
from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import networks
from pybullet_envs.minitaur.agents.scripts import train
FLAGS = tf.app.flags.FLAGS
class PPOTest(tf.test.TestCase):
def test_no_crash_cheetah(self):
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
for network in nets:
config = self._define_config()
with config.unlocked:
config.env = 'HalfCheetah-v1'
config.max_length = 200
config.steps = 1000
config.network = network
for score in train.train(config, env_processes=True):
float(score)
def test_no_crash_ant(self):
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
for network in nets:
config = self._define_config()
with config.unlocked:
config.env = 'Ant-v1'
config.max_length = 200
config.steps = 1000
config.network = network
for score in train.train(config, env_processes=True):
float(score)
def test_no_crash_observation_shape(self):
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
observ_shapes = (1,), (2, 3), (2, 3, 4)
for network, observ_shape in itertools.product(nets, observ_shapes):
config = self._define_config()
with config.unlocked:
config.env = functools.partial(
tools.MockEnvironment, observ_shape, action_shape=(3,),
min_duration=15, max_duration=15)
config.max_length = 20
config.steps = 100
config.network = network
for score in train.train(config, env_processes=False):
float(score)
def test_no_crash_variable_duration(self):
config = self._define_config()
with config.unlocked:
config.env = functools.partial(
tools.MockEnvironment, observ_shape=(2, 3), action_shape=(3,),
min_duration=5, max_duration=25)
config.max_length = 25
config.steps = 200
config.network = networks.RecurrentGaussianPolicy
for score in train.train(config, env_processes=False):
float(score)
def _define_config(self):
# Start from the example configuration.
locals().update(configs.default())
# pylint: disable=unused-variable
# General
algorithm = ppo.PPOAlgorithm
num_agents = 2
update_every = 4
use_gpu = False
# Network
policy_layers = 20, 10
value_layers = 20, 10
# Optimization
update_epochs_policy = 2
update_epochs_value = 2
# pylint: enable=unused-variable
return tools.AttrDict(locals())
if __name__ == '__main__':
FLAGS.config = 'unused'
tf.test.main()


@@ -0,0 +1,213 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for using reinforcement learning algorithms."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import os
import re
import ruamel.yaml as yaml
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
def define_simulation_graph(batch_env, algo_cls, config):
"""Define the algortihm and environment interaction.
Args:
batch_env: In-graph environments object.
algo_cls: Constructor of a batch algorithm.
config: Configuration object for the algorithm.
Returns:
Object providing graph elements via attributes.
"""
# pylint: disable=unused-variable
step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
is_training = tf.placeholder(tf.bool, name='is_training')
should_log = tf.placeholder(tf.bool, name='should_log')
do_report = tf.placeholder(tf.bool, name='do_report')
force_reset = tf.placeholder(tf.bool, name='force_reset')
algo = algo_cls(batch_env, step, is_training, should_log, config)
done, score, summary = tools.simulate(
batch_env, algo, should_log, force_reset)
message = 'Graph contains {} trainable variables.'
tf.logging.info(message.format(tools.count_weights()))
# pylint: enable=unused-variable
return tools.AttrDict(locals())
def define_batch_env(constructor, num_agents, env_processes):
"""Create environments and apply all desired wrappers.
Args:
constructor: Constructor of an OpenAI gym environment.
num_agents: Number of environments to combine in the batch.
env_processes: Whether to step environment in external processes.
Returns:
In-graph environments object.
"""
with tf.variable_scope('environments'):
if env_processes:
envs = [
tools.wrappers.ExternalProcess(constructor)
for _ in range(num_agents)]
else:
envs = [constructor() for _ in range(num_agents)]
batch_env = tools.BatchEnv(envs, blocking=not env_processes)
batch_env = tools.InGraphBatchEnv(batch_env)
return batch_env
def define_saver(exclude=None):
"""Create a saver for the variables we want to checkpoint.
Args:
exclude: List of regexes to match variable names to exclude.
Returns:
Saver object.
"""
variables = []
exclude = exclude or []
exclude = [re.compile(regex) for regex in exclude]
for variable in tf.global_variables():
if any(regex.match(variable.name) for regex in exclude):
continue
variables.append(variable)
saver = tf.train.Saver(variables, keep_checkpoint_every_n_hours=5)
return saver
def define_network(constructor, config, action_size):
"""Constructor for the recurrent cell for the algorithm.
Args:
constructor: Callable returning the network as RNNCell.
config: Object providing configurations via attributes.
action_size: Integer number of action dimensions.
Returns:
Created recurrent cell object.
"""
mean_weights_initializer = (
tf.contrib.layers.variance_scaling_initializer(
factor=config.init_mean_factor))
logstd_initializer = tf.random_normal_initializer(
config.init_logstd, 1e-10)
network = constructor(
config.policy_layers, config.value_layers, action_size,
mean_weights_initializer=mean_weights_initializer,
logstd_initializer=logstd_initializer)
return network
def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
"""Initialize or restore variables from a checkpoint if available.
Args:
sess: Session to initialize variables in.
saver: Saver to restore variables.
logdir: Directory to search for checkpoints.
checkpoint: Specify what checkpoint name to use; defaults to most recent.
resume: Whether to expect recovering a checkpoint or starting a new run.
Raises:
ValueError: If resume expected but no log directory specified.
RuntimeError: If no resume expected but a checkpoint was found.
"""
sess.run(tf.group(
tf.local_variables_initializer(),
tf.global_variables_initializer()))
if resume and not (logdir or checkpoint):
raise ValueError('Need to specify logdir to resume a checkpoint.')
if logdir:
state = tf.train.get_checkpoint_state(logdir)
if checkpoint:
checkpoint = os.path.join(logdir, checkpoint)
if not checkpoint and state and state.model_checkpoint_path:
checkpoint = state.model_checkpoint_path
if checkpoint and resume is False:
message = 'Found unexpected checkpoint when starting a new run.'
raise RuntimeError(message)
if checkpoint:
saver.restore(sess, checkpoint)
def save_config(config, logdir=None):
"""Save a new configuration by name.
If a logging directory is specified, it will be created and the configuration
will be stored there. Otherwise, a log message will be printed.
Args:
config: Configuration object.
logdir: Location for writing summaries and checkpoints if specified.
Returns:
Configuration object.
"""
if logdir:
with config.unlocked:
config.logdir = logdir
message = 'Start a new run and write summaries and checkpoints to {}.'
tf.logging.info(message.format(config.logdir))
tf.gfile.MakeDirs(config.logdir)
config_path = os.path.join(config.logdir, 'config.yaml')
with tf.gfile.FastGFile(config_path, 'w') as file_:
yaml.dump(config, file_, default_flow_style=False)
else:
message = (
'Start a new run without storing summaries and checkpoints since no '
'logging directory was specified.')
tf.logging.info(message)
return config
def load_config(logdir):
"""Load a configuration from the log directory.
Args:
logdir: The logging directory containing the configuration file.
Raises:
IOError: The logging directory does not contain a configuration file.
Returns:
Configuration object.
"""
config_path = logdir and os.path.join(logdir, 'config.yaml')
if not config_path or not tf.gfile.Exists(config_path):
message = (
'Cannot resume an existing run since the logging directory does not '
'contain a configuration file.')
raise IOError(message)
with tf.gfile.FastGFile(config_path, 'r') as file_:
config = yaml.load(file_)
message = 'Resume run and write summaries and checkpoints to {}.'
tf.logging.info(message.format(config.logdir))
return config
def set_up_logging():
"""Configure the TensorFlow logger."""
tf.logging.set_verbosity(tf.logging.INFO)
logging.getLogger('tensorflow').propagate = False
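As a small usage sketch (the log directory is an assumption), the exclude patterns passed to define_saver are regexes matched against full variable names; this is how train.py keeps the temporary episode buffers out of checkpoints:

# Sketch only: assumes the simulation graph has already been built in the
# default graph, so there are variables to save.
import tensorflow as tf
from pybullet_envs.minitaur.agents.scripts import utility

saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
with tf.Session() as sess:
  utility.initialize_variables(sess, saver, logdir='/tmp/example-run')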


@@ -0,0 +1,157 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to render videos of the Proximal Policy Gradient algorithm.
Command line:
python3 -m agents.scripts.visualize \
--logdir=/path/to/logdir/<time>-<config> --outdir=/path/to/outdir/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import os
import gym
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import utility
def _create_environment(config, outdir):
"""Constructor for an instance of the environment.
Args:
config: Object providing configurations via attributes.
outdir: Directory to store videos in.
Returns:
Wrapped OpenAI Gym environment.
"""
if isinstance(config.env, str):
env = gym.make(config.env)
else:
env = config.env()
# Ensure that the environment has the specification attribute set as expected
# by the monitor wrapper.
if not hasattr(env, 'spec'):
setattr(env, 'spec', getattr(env, 'spec', None))
if config.max_length:
env = tools.wrappers.LimitDuration(env, config.max_length)
# env = gym.wrappers.Monitor(
# env, outdir, lambda unused_episode_number: True)
env = tools.wrappers.RangeNormalize(env)
env = tools.wrappers.ClipAction(env)
env = tools.wrappers.ConvertTo32Bit(env)
return env
def _define_loop(graph, eval_steps):
"""Create and configure an evaluation loop.
Args:
graph: Object providing graph elements via attributes.
eval_steps: Number of evaluation steps per epoch.
Returns:
Loop object.
"""
loop = tools.Loop(
None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
loop.add_phase(
'eval', graph.done, graph.score, graph.summary, eval_steps,
report_every=eval_steps,
log_every=None,
checkpoint_every=None,
feed={graph.is_training: False})
return loop
def visualize(
logdir, outdir, num_agents, num_episodes, checkpoint=None,
env_processes=True):
"""Recover checkpoint and render videos from it.
Args:
logdir: Logging directory of the trained algorithm.
outdir: Directory to store rendered videos in.
num_agents: Number of environments to simulate in parallel.
num_episodes: Total number of episodes to simulate.
checkpoint: Checkpoint name to load; defaults to most recent.
env_processes: Whether to step environments in separate processes.
"""
config = utility.load_config(logdir)
with config.unlocked:
config.network = functools.partial(
utility.define_network, config.network, config)
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
config.value_optimizer = getattr(tf.train, config.value_optimizer)
with tf.device('/cpu:0'):
batch_env = utility.define_batch_env(
lambda: _create_environment(config, outdir),
num_agents, env_processes)
graph = utility.define_simulation_graph(
batch_env, config.algorithm, config)
total_steps = num_episodes * config.max_length
loop = _define_loop(graph, total_steps)
saver = utility.define_saver(
exclude=(r'.*_temporary/.*', r'global_step'))
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
with tf.Session(config=sess_config) as sess:
utility.initialize_variables(
sess, saver, config.logdir, checkpoint, resume=True)
for unused_score in loop.run(sess, saver, total_steps):
pass
batch_env.close()
def main(_):
"""Load a trained algorithm and render videos."""
utility.set_up_logging()
if not FLAGS.logdir or not FLAGS.outdir:
raise KeyError('You must specify logging and output directories.')
FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
visualize(
FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
FLAGS.checkpoint, FLAGS.env_processes)
if __name__ == '__main__':
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
'logdir', None,
'Directory to the checkpoint of a training run.')
tf.app.flags.DEFINE_string(
'outdir', None,
'Local directory for storing the monitoring output.')
tf.app.flags.DEFINE_string(
'checkpoint', None,
'Checkpoint name to load; defaults to most recent.')
tf.app.flags.DEFINE_integer(
'num_agents', 1,
'How many environments to step in parallel.')
tf.app.flags.DEFINE_integer(
'num_episodes', 5,
'Minimum number of episodes to render.')
tf.app.flags.DEFINE_boolean(
'env_processes', True,
'Step environments in separate processes to circumvent the GIL.')
tf.app.run()
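For completeness, the rendering entry point can also be called directly; the paths below are assumptions, with the run directory matching the training sketch above:

# Sketch only: render a handful of evaluation episodes from a finished run.
from pybullet_envs.minitaur.agents.scripts import visualize

visualize.visualize(
    logdir='/tmp/pendulum-run', outdir='/tmp/pendulum-videos',
    num_agents=1, num_episodes=5, checkpoint=None, env_processes=True)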