Make sure that the pre-trained galloping and trotting policies work for the minitaur_reactive_env and minitaur_trotting_env environments.
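The environments named in the commit message are not part of this diff; a minimal, hypothetical smoke test for one of them might look as follows (the class name, default constructor arguments, and the random-action rollout are assumptions, not code from this commit):

# Hypothetical sketch: roll out random actions in one of the environments the
# commit message refers to, to confirm it constructs and steps without errors.
from pybullet_envs.minitaur.envs import minitaur_trotting_env

env = minitaur_trotting_env.MinitaurTrottingEnv()
observation = env.reset()
for _ in range(100):
  observation, reward, done, _ = env.step(env.action_space.sample())
  if done:
    observation = env.reset()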
@@ -0,0 +1,23 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Executable scripts for reinforcement learning."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from . import train
from . import utility
from . import visualize
@@ -0,0 +1,128 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Example configurations using the PPO algorithm."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# pylint: disable=unused-variable

from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents.scripts import networks


def default():
  """Default configuration for PPO."""
  # General
  algorithm = ppo.PPOAlgorithm
  num_agents = 10
  eval_episodes = 25
  use_gpu = False
  # Network
  network = networks.ForwardGaussianPolicy
  weight_summaries = dict(
      all=r'.*',
      policy=r'.*/policy/.*',
      value=r'.*/value/.*')
  policy_layers = 200, 100
  value_layers = 200, 100
  init_mean_factor = 0.05
  init_logstd = -1
  # Optimization
  update_every = 25
  policy_optimizer = 'AdamOptimizer'
  value_optimizer = 'AdamOptimizer'
  update_epochs_policy = 50
  update_epochs_value = 50
  policy_lr = 1e-4
  value_lr = 3e-4
  # Losses
  discount = 0.985
  kl_target = 1e-2
  kl_cutoff_factor = 2
  kl_cutoff_coef = 1000
  kl_init_penalty = 1
  return locals()


def pendulum():
  """Configuration for the pendulum classic control task."""
  locals().update(default())
  # Environment
  env = 'Pendulum-v0'
  max_length = 200
  steps = 1e6  # 1M
  return locals()


def cheetah():
  """Configuration for MuJoCo's half cheetah task."""
  locals().update(default())
  # Environment
  env = 'HalfCheetah-v1'
  max_length = 1000
  steps = 1e7  # 10M
  return locals()


def walker():
  """Configuration for MuJoCo's walker task."""
  locals().update(default())
  # Environment
  env = 'Walker2d-v1'
  max_length = 1000
  steps = 1e7  # 10M
  return locals()


def reacher():
  """Configuration for MuJoCo's reacher task."""
  locals().update(default())
  # Environment
  env = 'Reacher-v1'
  max_length = 1000
  steps = 1e7  # 10M
  return locals()


def hopper():
  """Configuration for MuJoCo's hopper task."""
  locals().update(default())
  # Environment
  env = 'Hopper-v1'
  max_length = 1000
  steps = 2e7  # 20M
  return locals()


def ant():
  """Configuration for MuJoCo's ant task."""
  locals().update(default())
  # Environment
  env = 'Ant-v1'
  max_length = 1000
  steps = 5e7  # 50M
  return locals()


def humanoid():
  """Configuration for MuJoCo's humanoid task."""
  locals().update(default())
  # Environment
  env = 'Humanoid-v1'
  max_length = 1000
  steps = 5e7  # 50M
  return locals()
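Each configuration above returns locals(), so the function's local variables become a flat dictionary of hyperparameters. A minimal sketch of how such a configuration might be consumed, assuming the tools.AttrDict wrapper used elsewhere in this commit:

# Hypothetical usage sketch: wrap a configuration function's dictionary in an
# attribute dict and read hyperparameters from it.
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs

config = tools.AttrDict(configs.pendulum())
print(config.env, config.max_length, int(config.steps))  # Pendulum-v0 200 1000000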
@@ -0,0 +1,167 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Networks for the PPO algorithm defined as recurrent cells."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


_MEAN_WEIGHTS_INITIALIZER = tf.contrib.layers.variance_scaling_initializer(
    factor=0.1)
_LOGSTD_INITIALIZER = tf.random_normal_initializer(-1, 1e-10)


class LinearGaussianPolicy(tf.contrib.rnn.RNNCell):
  """Independent linear network with a tanh at the end for the policy and a feed forward network for the value.

  The policy network outputs the mean action and the log standard deviation
  is learned as an independent parameter vector.
  """

  def __init__(self,
               policy_layers,
               value_layers,
               action_size,
               mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
               logstd_initializer=_LOGSTD_INITIALIZER):
    self._policy_layers = policy_layers
    self._value_layers = value_layers
    self._action_size = action_size
    self._mean_weights_initializer = mean_weights_initializer
    self._logstd_initializer = logstd_initializer

  @property
  def state_size(self):
    unused_state_size = 1
    return unused_state_size

  @property
  def output_size(self):
    return (self._action_size, self._action_size, tf.TensorShape([]))

  def __call__(self, observation, state):
    with tf.variable_scope('policy'):
      x = tf.contrib.layers.flatten(observation)
      mean = tf.contrib.layers.fully_connected(
          x,
          self._action_size,
          tf.tanh,
          weights_initializer=self._mean_weights_initializer)
      logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32,
                               self._logstd_initializer)
      logstd = tf.tile(logstd[None, ...],
                       [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
    with tf.variable_scope('value'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._value_layers:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
    return (mean, logstd, value), state


class ForwardGaussianPolicy(tf.contrib.rnn.RNNCell):
  """Independent feed forward networks for policy and value.

  The policy network outputs the mean action and the log standard deviation
  is learned as an independent parameter vector.
  """

  def __init__(
      self, policy_layers, value_layers, action_size,
      mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
      logstd_initializer=_LOGSTD_INITIALIZER):
    self._policy_layers = policy_layers
    self._value_layers = value_layers
    self._action_size = action_size
    self._mean_weights_initializer = mean_weights_initializer
    self._logstd_initializer = logstd_initializer

  @property
  def state_size(self):
    unused_state_size = 1
    return unused_state_size

  @property
  def output_size(self):
    return (self._action_size, self._action_size, tf.TensorShape([]))

  def __call__(self, observation, state):
    with tf.variable_scope('policy'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._policy_layers:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      mean = tf.contrib.layers.fully_connected(
          x, self._action_size, tf.tanh,
          weights_initializer=self._mean_weights_initializer)
      logstd = tf.get_variable(
          'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
      logstd = tf.tile(
          logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
    with tf.variable_scope('value'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._value_layers:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
    return (mean, logstd, value), state


class RecurrentGaussianPolicy(tf.contrib.rnn.RNNCell):
  """Independent recurrent policy and feed forward value networks.

  The policy network outputs the mean action and the log standard deviation
  is learned as an independent parameter vector. The last policy layer is
  recurrent and uses a GRU cell.
  """

  def __init__(
      self, policy_layers, value_layers, action_size,
      mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
      logstd_initializer=_LOGSTD_INITIALIZER):
    self._policy_layers = policy_layers
    self._value_layers = value_layers
    self._action_size = action_size
    self._mean_weights_initializer = mean_weights_initializer
    self._logstd_initializer = logstd_initializer
    self._cell = tf.contrib.rnn.GRUBlockCell(100)

  @property
  def state_size(self):
    return self._cell.state_size

  @property
  def output_size(self):
    return (self._action_size, self._action_size, tf.TensorShape([]))

  def __call__(self, observation, state):
    with tf.variable_scope('policy'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._policy_layers[:-1]:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      x, state = self._cell(x, state)
      mean = tf.contrib.layers.fully_connected(
          x, self._action_size, tf.tanh,
          weights_initializer=self._mean_weights_initializer)
      logstd = tf.get_variable(
          'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
      logstd = tf.tile(
          logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
    with tf.variable_scope('value'):
      x = tf.contrib.layers.flatten(observation)
      for size in self._value_layers:
        x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
      value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
    return (mean, logstd, value), state
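All three classes above expose the same RNNCell interface: called on a batch of observations (and a state that only the recurrent variant actually uses), they return the action mean, the log standard deviation, and the value estimate. A hedged construction sketch, with the observation and action sizes chosen purely for illustration:

# Hypothetical sketch: build one of the policy cells above and apply it to a
# placeholder batch of observations. Sizes are illustrative assumptions.
import tensorflow as tf

from pybullet_envs.minitaur.agents.scripts import networks

cell = networks.ForwardGaussianPolicy(
    policy_layers=(200, 100), value_layers=(200, 100), action_size=8)
observation = tf.placeholder(tf.float32, [None, 28])
state = tf.zeros([tf.shape(observation)[0], 1])  # ignored by the feed forward cell
(mean, logstd, value), _ = cell(observation, state)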
@@ -0,0 +1,165 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Script to train a batch reinforcement learning algorithm.

Command line:

  python3 -m agents.scripts.train --logdir=/path/to/logdir --config=pendulum
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import functools
import os

import gym
import tensorflow as tf

from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import utility


def _create_environment(config):
  """Constructor for an instance of the environment.

  Args:
    config: Object providing configurations via attributes.

  Returns:
    Wrapped OpenAI Gym environment.
  """
  if isinstance(config.env, str):
    env = gym.make(config.env)
  else:
    env = config.env()
  if config.max_length:
    env = tools.wrappers.LimitDuration(env, config.max_length)
  env = tools.wrappers.RangeNormalize(env)
  env = tools.wrappers.ClipAction(env)
  env = tools.wrappers.ConvertTo32Bit(env)
  return env


def _define_loop(graph, logdir, train_steps, eval_steps):
  """Create and configure a training loop with training and evaluation phases.

  Args:
    graph: Object providing graph elements via attributes.
    logdir: Log directory for storing checkpoints and summaries.
    train_steps: Number of training steps per epoch.
    eval_steps: Number of evaluation steps per epoch.

  Returns:
    Loop object.
  """
  loop = tools.Loop(
      logdir, graph.step, graph.should_log, graph.do_report,
      graph.force_reset)
  loop.add_phase(
      'train', graph.done, graph.score, graph.summary, train_steps,
      report_every=None,
      log_every=train_steps // 2,
      checkpoint_every=None,
      feed={graph.is_training: True})
  loop.add_phase(
      'eval', graph.done, graph.score, graph.summary, eval_steps,
      report_every=eval_steps,
      log_every=eval_steps // 2,
      checkpoint_every=10 * eval_steps,
      feed={graph.is_training: False})
  return loop


def train(config, env_processes):
  """Training and evaluation entry point yielding scores.

  Resolves some configuration attributes, creates environments, graph, and
  training loop. By default, assigns all operations to the CPU.

  Args:
    config: Object providing configurations via attributes.
    env_processes: Whether to step environments in separate processes.

  Yields:
    Evaluation scores.
  """
  tf.reset_default_graph()
  with config.unlocked:
    config.network = functools.partial(
        utility.define_network, config.network, config)
    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
    config.value_optimizer = getattr(tf.train, config.value_optimizer)
  if config.update_every % config.num_agents:
    tf.logging.warn('Number of agents should divide episodes per update.')
  with tf.device('/cpu:0'):
    batch_env = utility.define_batch_env(
        lambda: _create_environment(config),
        config.num_agents, env_processes)
    graph = utility.define_simulation_graph(
        batch_env, config.algorithm, config)
    loop = _define_loop(
        graph, config.logdir,
        config.update_every * config.max_length,
        config.eval_episodes * config.max_length)
    total_steps = int(
        config.steps / config.update_every *
        (config.update_every + config.eval_episodes))
  # Exclude episode related variables since the Python state of environments is
  # not checkpointed and thus new episodes start after resuming.
  saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  sess_config.gpu_options.allow_growth = True
  with tf.Session(config=sess_config) as sess:
    utility.initialize_variables(sess, saver, config.logdir)
    for score in loop.run(sess, saver, total_steps):
      yield score
  batch_env.close()


def main(_):
  """Create or load configuration and launch the trainer."""
  utility.set_up_logging()
  if not FLAGS.config:
    raise KeyError('You must specify a configuration.')
  logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
      FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
  try:
    config = utility.load_config(logdir)
  except IOError:
    config = tools.AttrDict(getattr(configs, FLAGS.config)())
    config = utility.save_config(config, logdir)
  for score in train(config, FLAGS.env_processes):
    tf.logging.info('Score {}.'.format(score))


if __name__ == '__main__':
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string(
      'logdir', None,
      'Base directory to store logs.')
  tf.app.flags.DEFINE_string(
      'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
      'Sub directory to store logs.')
  tf.app.flags.DEFINE_string(
      'config', None,
      'Configuration to execute.')
  tf.app.flags.DEFINE_boolean(
      'env_processes', True,
      'Step environments in separate processes to circumvent the GIL.')
  tf.app.run()
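Besides the flags-based entry point, train.train can be driven directly from Python, which is what the tests later in this commit do. A minimal sketch of that flow, assuming a writable placeholder log directory such as /tmp/pendulum:

# Hypothetical sketch: run training for the pendulum configuration without the
# command-line flags. The log directory path is a placeholder.
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import train
from pybullet_envs.minitaur.agents.scripts import utility

config = tools.AttrDict(configs.pendulum())
config = utility.save_config(config, logdir='/tmp/pendulum')
for score in train.train(config, env_processes=True):
  print('Evaluation score: {}'.format(score))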
@@ -0,0 +1,110 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for the PPO algorithm usage example."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import itertools

import tensorflow as tf

from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import networks
from pybullet_envs.minitaur.agents.scripts import train


FLAGS = tf.app.flags.FLAGS


class PPOTest(tf.test.TestCase):

  def test_no_crash_cheetah(self):
    nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
    for network in nets:
      config = self._define_config()
      with config.unlocked:
        config.env = 'HalfCheetah-v1'
        config.max_length = 200
        config.steps = 1000
        config.network = network
      for score in train.train(config, env_processes=True):
        float(score)

  def test_no_crash_ant(self):
    nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
    for network in nets:
      config = self._define_config()
      with config.unlocked:
        config.env = 'Ant-v1'
        config.max_length = 200
        config.steps = 1000
        config.network = network
      for score in train.train(config, env_processes=True):
        float(score)

  def test_no_crash_observation_shape(self):
    nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
    observ_shapes = (1,), (2, 3), (2, 3, 4)
    for network, observ_shape in itertools.product(nets, observ_shapes):
      config = self._define_config()
      with config.unlocked:
        config.env = functools.partial(
            tools.MockEnvironment, observ_shape, action_shape=(3,),
            min_duration=15, max_duration=15)
        config.max_length = 20
        config.steps = 100
        config.network = network
      for score in train.train(config, env_processes=False):
        float(score)

  def test_no_crash_variable_duration(self):
    config = self._define_config()
    with config.unlocked:
      config.env = functools.partial(
          tools.MockEnvironment, observ_shape=(2, 3), action_shape=(3,),
          min_duration=5, max_duration=25)
      config.max_length = 25
      config.steps = 200
      config.network = networks.RecurrentGaussianPolicy
    for score in train.train(config, env_processes=False):
      float(score)

  def _define_config(self):
    # Start from the example configuration.
    locals().update(configs.default())
    # pylint: disable=unused-variable
    # General
    algorithm = ppo.PPOAlgorithm
    num_agents = 2
    update_every = 4
    use_gpu = False
    # Network
    policy_layers = 20, 10
    value_layers = 20, 10
    # Optimization
    update_epochs_policy = 2
    update_epochs_value = 2
    # pylint: enable=unused-variable
    return tools.AttrDict(locals())


if __name__ == '__main__':
  FLAGS.config = 'unused'
  tf.test.main()
@@ -0,0 +1,213 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for using reinforcement learning algorithms."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import os
import re

import ruamel.yaml as yaml
import tensorflow as tf

from pybullet_envs.minitaur.agents import tools


def define_simulation_graph(batch_env, algo_cls, config):
  """Define the algorithm and environment interaction.

  Args:
    batch_env: In-graph environments object.
    algo_cls: Constructor of a batch algorithm.
    config: Configuration object for the algorithm.

  Returns:
    Object providing graph elements via attributes.
  """
  # pylint: disable=unused-variable
  step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
  is_training = tf.placeholder(tf.bool, name='is_training')
  should_log = tf.placeholder(tf.bool, name='should_log')
  do_report = tf.placeholder(tf.bool, name='do_report')
  force_reset = tf.placeholder(tf.bool, name='force_reset')
  algo = algo_cls(batch_env, step, is_training, should_log, config)
  done, score, summary = tools.simulate(
      batch_env, algo, should_log, force_reset)
  message = 'Graph contains {} trainable variables.'
  tf.logging.info(message.format(tools.count_weights()))
  # pylint: enable=unused-variable
  return tools.AttrDict(locals())


def define_batch_env(constructor, num_agents, env_processes):
  """Create environments and apply all desired wrappers.

  Args:
    constructor: Constructor of an OpenAI gym environment.
    num_agents: Number of environments to combine in the batch.
    env_processes: Whether to step environments in external processes.

  Returns:
    In-graph environments object.
  """
  with tf.variable_scope('environments'):
    if env_processes:
      envs = [
          tools.wrappers.ExternalProcess(constructor)
          for _ in range(num_agents)]
    else:
      envs = [constructor() for _ in range(num_agents)]
    batch_env = tools.BatchEnv(envs, blocking=not env_processes)
    batch_env = tools.InGraphBatchEnv(batch_env)
  return batch_env


def define_saver(exclude=None):
  """Create a saver for the variables we want to checkpoint.

  Args:
    exclude: List of regexes to match variable names to exclude.

  Returns:
    Saver object.
  """
  variables = []
  exclude = exclude or []
  exclude = [re.compile(regex) for regex in exclude]
  for variable in tf.global_variables():
    if any(regex.match(variable.name) for regex in exclude):
      continue
    variables.append(variable)
  saver = tf.train.Saver(variables, keep_checkpoint_every_n_hours=5)
  return saver


def define_network(constructor, config, action_size):
  """Constructor for the recurrent cell for the algorithm.

  Args:
    constructor: Callable returning the network as RNNCell.
    config: Object providing configurations via attributes.
    action_size: Integer indicating the amount of action dimensions.

  Returns:
    Created recurrent cell object.
  """
  mean_weights_initializer = (
      tf.contrib.layers.variance_scaling_initializer(
          factor=config.init_mean_factor))
  logstd_initializer = tf.random_normal_initializer(
      config.init_logstd, 1e-10)
  network = constructor(
      config.policy_layers, config.value_layers, action_size,
      mean_weights_initializer=mean_weights_initializer,
      logstd_initializer=logstd_initializer)
  return network


def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
  """Initialize or restore variables from a checkpoint if available.

  Args:
    sess: Session to initialize variables in.
    saver: Saver to restore variables.
    logdir: Directory to search for checkpoints.
    checkpoint: Specify what checkpoint name to use; defaults to most recent.
    resume: Whether to expect recovering a checkpoint or starting a new run.

  Raises:
    ValueError: If resume expected but no log directory specified.
    RuntimeError: If no resume expected but a checkpoint was found.
  """
  sess.run(tf.group(
      tf.local_variables_initializer(),
      tf.global_variables_initializer()))
  if resume and not (logdir or checkpoint):
    raise ValueError('Need to specify logdir to resume a checkpoint.')
  if logdir:
    state = tf.train.get_checkpoint_state(logdir)
    if checkpoint:
      checkpoint = os.path.join(logdir, checkpoint)
    if not checkpoint and state and state.model_checkpoint_path:
      checkpoint = state.model_checkpoint_path
    if checkpoint and resume is False:
      message = 'Found unexpected checkpoint when starting a new run.'
      raise RuntimeError(message)
    if checkpoint:
      saver.restore(sess, checkpoint)


def save_config(config, logdir=None):
  """Save a new configuration by name.

  If a logging directory is specified, it will be created and the configuration
  will be stored there. Otherwise, a log message will be printed.

  Args:
    config: Configuration object.
    logdir: Location for writing summaries and checkpoints if specified.

  Returns:
    Configuration object.
  """
  if logdir:
    with config.unlocked:
      config.logdir = logdir
    message = 'Start a new run and write summaries and checkpoints to {}.'
    tf.logging.info(message.format(config.logdir))
    tf.gfile.MakeDirs(config.logdir)
    config_path = os.path.join(config.logdir, 'config.yaml')
    with tf.gfile.FastGFile(config_path, 'w') as file_:
      yaml.dump(config, file_, default_flow_style=False)
  else:
    message = (
        'Start a new run without storing summaries and checkpoints since no '
        'logging directory was specified.')
    tf.logging.info(message)
  return config


def load_config(logdir):
  """Load a configuration from the log directory.

  Args:
    logdir: The logging directory containing the configuration file.

  Raises:
    IOError: The logging directory does not contain a configuration file.

  Returns:
    Configuration object.
  """
  config_path = logdir and os.path.join(logdir, 'config.yaml')
  if not config_path or not tf.gfile.Exists(config_path):
    message = (
        'Cannot resume an existing run since the logging directory does not '
        'contain a configuration file.')
    raise IOError(message)
  with tf.gfile.FastGFile(config_path, 'r') as file_:
    config = yaml.load(file_)
  message = 'Resume run and write summaries and checkpoints to {}.'
  tf.logging.info(message.format(config.logdir))
  return config


def set_up_logging():
  """Configure the TensorFlow logger."""
  tf.logging.set_verbosity(tf.logging.INFO)
  logging.getLogger('tensorflow').propagate = False
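Taken together, define_saver and initialize_variables implement the checkpoint handling that both train.py and visualize.py rely on. A brief sketch of that pattern in isolation (the dummy variable and the log directory are placeholders, not part of this commit):

# Hypothetical sketch: build a saver that skips temporary PPO variables and
# initialize or restore a session from a log directory.
import tensorflow as tf

from pybullet_envs.minitaur.agents.scripts import utility

tf.Variable(0, name='dummy')  # stands in for the training graph's variables
saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
with tf.Session() as sess:
  utility.initialize_variables(sess, saver, logdir='/tmp/pendulum')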
@@ -0,0 +1,157 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Script to render videos of the Proximal Policy Optimization (PPO) algorithm.

Command line:

  python3 -m agents.scripts.visualize \
      --logdir=/path/to/logdir/<time>-<config> --outdir=/path/to/outdir/
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import os

import gym
import tensorflow as tf

from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import utility


def _create_environment(config, outdir):
  """Constructor for an instance of the environment.

  Args:
    config: Object providing configurations via attributes.
    outdir: Directory to store videos in.

  Returns:
    Wrapped OpenAI Gym environment.
  """
  if isinstance(config.env, str):
    env = gym.make(config.env)
  else:
    env = config.env()
  # Ensure that the environment has the specification attribute set as expected
  # by the monitor wrapper.
  if not hasattr(env, 'spec'):
    setattr(env, 'spec', getattr(env, 'spec', None))
  if config.max_length:
    env = tools.wrappers.LimitDuration(env, config.max_length)
  # env = gym.wrappers.Monitor(
  #     env, outdir, lambda unused_episode_number: True)
  env = tools.wrappers.RangeNormalize(env)
  env = tools.wrappers.ClipAction(env)
  env = tools.wrappers.ConvertTo32Bit(env)
  return env


def _define_loop(graph, eval_steps):
  """Create and configure an evaluation loop.

  Args:
    graph: Object providing graph elements via attributes.
    eval_steps: Number of evaluation steps per epoch.

  Returns:
    Loop object.
  """
  loop = tools.Loop(
      None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
  loop.add_phase(
      'eval', graph.done, graph.score, graph.summary, eval_steps,
      report_every=eval_steps,
      log_every=None,
      checkpoint_every=None,
      feed={graph.is_training: False})
  return loop


def visualize(
    logdir, outdir, num_agents, num_episodes, checkpoint=None,
    env_processes=True):
  """Recover checkpoint and render videos from it.

  Args:
    logdir: Logging directory of the trained algorithm.
    outdir: Directory to store rendered videos in.
    num_agents: Number of environments to simulate in parallel.
    num_episodes: Total number of episodes to simulate.
    checkpoint: Checkpoint name to load; defaults to most recent.
    env_processes: Whether to step environments in separate processes.
  """
  config = utility.load_config(logdir)
  with config.unlocked:
    config.network = functools.partial(
        utility.define_network, config.network, config)
    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
    config.value_optimizer = getattr(tf.train, config.value_optimizer)
  with tf.device('/cpu:0'):
    batch_env = utility.define_batch_env(
        lambda: _create_environment(config, outdir),
        num_agents, env_processes)
    graph = utility.define_simulation_graph(
        batch_env, config.algorithm, config)
    total_steps = num_episodes * config.max_length
    loop = _define_loop(graph, total_steps)
  saver = utility.define_saver(
      exclude=(r'.*_temporary/.*', r'global_step'))
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  sess_config.gpu_options.allow_growth = True
  with tf.Session(config=sess_config) as sess:
    utility.initialize_variables(
        sess, saver, config.logdir, checkpoint, resume=True)
    for unused_score in loop.run(sess, saver, total_steps):
      pass
  batch_env.close()


def main(_):
  """Load a trained algorithm and render videos."""
  utility.set_up_logging()
  if not FLAGS.logdir or not FLAGS.outdir:
    raise KeyError('You must specify logging and output directories.')
  FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
  FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
  visualize(
      FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
      FLAGS.checkpoint, FLAGS.env_processes)


if __name__ == '__main__':
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string(
      'logdir', None,
      'Directory to the checkpoint of a training run.')
  tf.app.flags.DEFINE_string(
      'outdir', None,
      'Local directory for storing the monitoring outdir.')
  tf.app.flags.DEFINE_string(
      'checkpoint', None,
      'Checkpoint name to load; defaults to most recent.')
  tf.app.flags.DEFINE_integer(
      'num_agents', 1,
      'How many environments to step in parallel.')
  tf.app.flags.DEFINE_integer(
      'num_episodes', 5,
      'Minimum number of episodes to render.')
  tf.app.flags.DEFINE_boolean(
      'env_processes', True,
      'Step environments in separate processes to circumvent the GIL.')
  tf.app.run()
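The same functionality is available programmatically; a hedged sketch with placeholder paths:

# Hypothetical sketch: render evaluation episodes from a finished training run
# without going through the flags. Both paths are placeholders.
from pybullet_envs.minitaur.agents.scripts import visualize

visualize.visualize(
    logdir='/tmp/pendulum/20170101T000000-pendulum', outdir='/tmp/videos',
    num_agents=1, num_episodes=5, checkpoint=None, env_processes=True)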