diff --git a/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/config.yaml b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/config.yaml new file mode 100644 index 000000000..a2e6baedd --- /dev/null +++ b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/config.yaml @@ -0,0 +1,54 @@ +!!python/object/new:pybullet_envs.minitaur.agents.tools.attr_dict.AttrDict +dictitems: + algorithm: !!python/name:pybullet_envs.minitaur.agents.ppo.algorithm.PPOAlgorithm '' + discount: 0.9868209124499899 + env: !!python/object/apply:functools.partial + args: + - &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_reactive_env.MinitaurReactiveEnv '' + state: !!python/tuple + - *id001 + - !!python/tuple [] + - accurate_motor_model_enabled: true + control_latency: 0.02 + energy_weight: 0.005 + env_randomizer: null + motor_kd: 0.015 + num_steps_to_log: 1000 + pd_latency: 0.003 + remove_default_joint_damping: true + render: false + urdf_version: rainbow_dash_v0 + - null + eval_episodes: 25 + init_logstd: -1.1579536194508315 + init_mean_factor: 0.3084392491563408 + kl_cutoff_coef: 1000 + kl_cutoff_factor: 2 + kl_init_penalty: 1 + kl_target: 0.01 + logdir: /cns/ij-d/home/jietan/experiment/minitaur_vizier_study_ppo/minreact_nonexp_nr_02_186515603_186518344/333 + max_length: 1000 + network: !!python/name:pybullet_envs.minitaur.agents.scripts.networks.ForwardGaussianPolicy '' + network_config: {} + num_agents: 25 + policy_layers: !!python/tuple + - 114 + - 45 + policy_lr: 0.00023516695218031146 + policy_optimizer: AdamOptimizer + steps: 7000000.0 + update_epochs_policy: 25 + update_epochs_value: 25 + update_every: 25 + use_gpu: false + value_layers: !!python/tuple + - 170 + - 78 + value_lr: 0.00031014032715987193 + value_optimizer: AdamOptimizer + weight_summaries: + all: .* + policy: .*/policy/.* + value: .*/value/.* +state: + _mutable: false \ No newline at end of file diff --git a/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/model.ckpt-14000000.data-00000-of-00001 b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/model.ckpt-14000000.data-00000-of-00001 new file mode 100644 index 000000000..70c3900ea Binary files /dev/null and b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/model.ckpt-14000000.data-00000-of-00001 differ diff --git a/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/model.ckpt-14000000.index b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/model.ckpt-14000000.index new file mode 100644 index 000000000..00c1187b8 Binary files /dev/null and b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/model.ckpt-14000000.index differ diff --git a/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/model.ckpt-14000000.meta b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/model.ckpt-14000000.meta new file mode 100644 index 000000000..4cf3b431d Binary files /dev/null and b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_reactive_env/model.ckpt-14000000.meta differ diff --git a/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/config.yaml b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/config.yaml new file mode 100644 index 000000000..232150020 --- /dev/null +++ b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/config.yaml @@ -0,0 +1,51 @@ 
+!!python/object/new:pybullet_envs.minitaur.agents.tools.attr_dict.AttrDict +dictitems: + algorithm: !!python/name:pybullet_envs.minitaur.agents.ppo.algorithm.PPOAlgorithm '' + discount: 0.9899764168788918 + env: !!python/object/apply:functools.partial + args: + - &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_trotting_env.MinitaurTrottingEnv '' + state: !!python/tuple + - *id001 + - !!python/tuple [] + - env_randomizer: null + motor_kd: 0.015 + num_steps_to_log: 1000 + pd_latency: 0.003 + remove_default_joint_damping: true + render: false + urdf_version: rainbow_dash_v0 + - null + eval_episodes: 25 + init_logstd: -0.6325707791047228 + init_mean_factor: 0.6508531688665261 + kl_cutoff_coef: 1000 + kl_cutoff_factor: 2 + kl_init_penalty: 1 + kl_target: 0.01 + logdir: /cns/ij-d/home/jietan/experiment/minitaur_vizier_study_ppo/mintrot_nonexp_nr_01_186515603_186518344/373 + max_length: 1000 + network: !!python/name:pybullet_envs.minitaur.agents.scripts.networks.ForwardGaussianPolicy '' + network_config: {} + num_agents: 25 + policy_layers: !!python/tuple + - 133 + - 100 + policy_lr: 0.00048104185841752015 + policy_optimizer: AdamOptimizer + steps: 7000000.0 + update_epochs_policy: 25 + update_epochs_value: 25 + update_every: 25 + use_gpu: false + value_layers: !!python/tuple + - 64 + - 57 + value_lr: 0.0012786382882055453 + value_optimizer: AdamOptimizer + weight_summaries: + all: .* + policy: .*/policy/.* + value: .*/value/.* +state: + _mutable: false diff --git a/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/model.ckpt-14000000.data-00000-of-00001 b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/model.ckpt-14000000.data-00000-of-00001 new file mode 100644 index 000000000..e8f1a38df Binary files /dev/null and b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/model.ckpt-14000000.data-00000-of-00001 differ diff --git a/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/model.ckpt-14000000.index b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/model.ckpt-14000000.index new file mode 100644 index 000000000..449d15c10 Binary files /dev/null and b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/model.ckpt-14000000.index differ diff --git a/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/model.ckpt-14000000.meta b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/model.ckpt-14000000.meta new file mode 100644 index 000000000..024c73aa5 Binary files /dev/null and b/examples/pybullet/gym/pybullet_data/policies/ppo/minitaur_trotting_env/model.ckpt-14000000.meta differ diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/__init__.py b/examples/pybullet/gym/pybullet_envs/minitaur/__init__.py index 39e44ec8a..b28b04f64 100644 --- a/examples/pybullet/gym/pybullet_envs/minitaur/__init__.py +++ b/examples/pybullet/gym/pybullet_envs/minitaur/__init__.py @@ -1 +1,3 @@ -import gym + + + diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/__init__.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/__init__.py new file mode 100644 index 000000000..8d1c8b69c --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/__init__.py @@ -0,0 +1 @@ + diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/__init__.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/__init__.py new file mode 100644 index 000000000..26a87baf9 --- /dev/null +++ 
b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2017 The TensorFlow Agents Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Proximal Policy Optimization algorithm.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from .algorithm import PPOAlgorithm diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/algorithm.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/algorithm.py new file mode 100644 index 000000000..81b18b7b4 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/algorithm.py @@ -0,0 +1,558 @@ +# Copyright 2017 The TensorFlow Agents Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Proximal Policy Optimization algorithm. + +Based on John Schulman's implementation in Python and Theano: +https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +import tensorflow as tf + +from . import memory +from . import normalize +from . import utility + + +_NetworkOutput = collections.namedtuple( + 'NetworkOutput', 'policy, mean, logstd, value, state') + + +class PPOAlgorithm(object): + """A vectorized implementation of the PPO algorithm by John Schulman.""" + + def __init__(self, batch_env, step, is_training, should_log, config): + """Create an instance of the PPO algorithm. + + Args: + batch_env: In-graph batch environment. + step: Integer tensor holding the current training step. + is_training: Boolean tensor for whether the algorithm should train. + should_log: Boolean tensor for whether summaries should be returned. + config: Object containing the agent configuration as attributes. + """ + self._batch_env = batch_env + self._step = step + self._is_training = is_training + self._should_log = should_log + self._config = config + self._observ_filter = normalize.StreamingNormalize( + self._batch_env.observ[0], center=True, scale=True, clip=5, + name='normalize_observ') + self._reward_filter = normalize.StreamingNormalize( + self._batch_env.reward[0], center=False, scale=True, clip=10, + name='normalize_reward') + # Memory stores tuple of observ, action, mean, logstd, reward. 
+ template = ( + self._batch_env.observ[0], self._batch_env.action[0], + self._batch_env.action[0], self._batch_env.action[0], + self._batch_env.reward[0]) + self._memory = memory.EpisodeMemory( + template, config.update_every, config.max_length, 'memory') + self._memory_index = tf.Variable(0, False) + use_gpu = self._config.use_gpu and utility.available_gpus() + with tf.device('/gpu:0' if use_gpu else '/cpu:0'): + # Create network variables for later calls to reuse. + self._network( + tf.zeros_like(self._batch_env.observ)[:, None], + tf.ones(len(self._batch_env)), reuse=None) + cell = self._config.network(self._batch_env.action.shape[1].value) + with tf.variable_scope('ppo_temporary'): + self._episodes = memory.EpisodeMemory( + template, len(batch_env), config.max_length, 'episodes') + self._last_state = utility.create_nested_vars( + cell.zero_state(len(batch_env), tf.float32)) + self._last_action = tf.Variable( + tf.zeros_like(self._batch_env.action), False, name='last_action') + self._last_mean = tf.Variable( + tf.zeros_like(self._batch_env.action), False, name='last_mean') + self._last_logstd = tf.Variable( + tf.zeros_like(self._batch_env.action), False, name='last_logstd') + self._penalty = tf.Variable( + self._config.kl_init_penalty, False, dtype=tf.float32) + self._policy_optimizer = self._config.policy_optimizer( + self._config.policy_lr, name='policy_optimizer') + self._value_optimizer = self._config.value_optimizer( + self._config.value_lr, name='value_optimizer') + + def begin_episode(self, agent_indices): + """Reset the recurrent states and stored episode. + + Args: + agent_indices: 1D tensor of batch indices for agents starting an episode. + + Returns: + Summary tensor. + """ + with tf.name_scope('begin_episode/'): + reset_state = utility.reinit_nested_vars(self._last_state, agent_indices) + reset_buffer = self._episodes.clear(agent_indices) + with tf.control_dependencies([reset_state, reset_buffer]): + return tf.constant('') + + def perform(self, observ): + """Compute batch of actions and a summary for a batch of observation. + + Args: + observ: Tensor of a batch of observations for all agents. + + Returns: + Tuple of action batch tensor and summary tensor. + """ + with tf.name_scope('perform/'): + observ = self._observ_filter.transform(observ) + network = self._network( + observ[:, None], tf.ones(observ.shape[0]), self._last_state) + action = tf.cond( + self._is_training, network.policy.sample, lambda: network.mean) + logprob = network.policy.log_prob(action)[:, 0] + # pylint: disable=g-long-lambda + summary = tf.cond(self._should_log, lambda: tf.summary.merge([ + tf.summary.histogram('mean', network.mean[:, 0]), + tf.summary.histogram('std', tf.exp(network.logstd[:, 0])), + tf.summary.histogram('action', action[:, 0]), + tf.summary.histogram('logprob', logprob)]), str) + # Remember current policy to append to memory in the experience callback. + with tf.control_dependencies([ + utility.assign_nested_vars(self._last_state, network.state), + self._last_action.assign(action[:, 0]), + self._last_mean.assign(network.mean[:, 0]), + self._last_logstd.assign(network.logstd[:, 0])]): + return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary) + + def experience(self, observ, action, reward, unused_done, unused_nextob): + """Process the transition tuple of the current step. + + When training, add the current transition tuple to the memory and update + the streaming statistics for observations and rewards. A summary string is + returned if requested at this step. 
+ + Args: + observ: Batch tensor of observations. + action: Batch tensor of actions. + reward: Batch tensor of rewards. + unused_done: Batch tensor of done flags. + unused_nextob: Batch tensor of successor observations. + + Returns: + Summary tensor. + """ + with tf.name_scope('experience/'): + return tf.cond( + self._is_training, + lambda: self._define_experience(observ, action, reward), str) + + def _define_experience(self, observ, action, reward): + """Implement the branch of experience() entered during training.""" + update_filters = tf.summary.merge([ + self._observ_filter.update(observ), + self._reward_filter.update(reward)]) + with tf.control_dependencies([update_filters]): + if self._config.train_on_agent_action: + # NOTE: Doesn't seem to change much. + action = self._last_action + batch = observ, action, self._last_mean, self._last_logstd, reward + append = self._episodes.append(batch, tf.range(len(self._batch_env))) + with tf.control_dependencies([append]): + norm_observ = self._observ_filter.transform(observ) + norm_reward = tf.reduce_mean(self._reward_filter.transform(reward)) + # pylint: disable=g-long-lambda + summary = tf.cond(self._should_log, lambda: tf.summary.merge([ + update_filters, + self._observ_filter.summary(), + self._reward_filter.summary(), + tf.summary.scalar('memory_size', self._memory_index), + tf.summary.histogram('normalized_observ', norm_observ), + tf.summary.histogram('action', self._last_action), + tf.summary.scalar('normalized_reward', norm_reward)]), str) + return summary + + def end_episode(self, agent_indices): + """Add episodes to the memory and perform update steps if memory is full. + + During training, add the collected episodes of the batch indices that + finished their episode to the memory. If the memory is full, train on it, + and then clear the memory. A summary string is returned if requested at + this step. + + Args: + agent_indices: 1D tensor of batch indices for agents starting an episode. + + Returns: + Summary tensor. + """ + with tf.name_scope('end_episode/'): + return tf.cond( + self._is_training, + lambda: self._define_end_episode(agent_indices), str) + + def _define_end_episode(self, agent_indices): + """Implement the branch of end_episode() entered during training.""" + episodes, length = self._episodes.data(agent_indices) + space_left = self._config.update_every - self._memory_index + use_episodes = tf.range(tf.minimum( + tf.shape(agent_indices)[0], space_left)) + episodes = [tf.gather(elem, use_episodes) for elem in episodes] + append = self._memory.replace( + episodes, tf.gather(length, use_episodes), + use_episodes + self._memory_index) + with tf.control_dependencies([append]): + inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0]) + with tf.control_dependencies([inc_index]): + memory_full = self._memory_index >= self._config.update_every + return tf.cond(memory_full, self._training, str) + + def _training(self): + """Perform multiple training iterations of both policy and value baseline. + + Training on the episodes collected in the memory. Reset the memory + afterwards. Always returns a summary string. + + Returns: + Summary tensor. 
+ """ + with tf.name_scope('training'): + assert_full = tf.assert_equal( + self._memory_index, self._config.update_every) + with tf.control_dependencies([assert_full]): + data = self._memory.data() + (observ, action, old_mean, old_logstd, reward), length = data + with tf.control_dependencies([tf.assert_greater(length, 0)]): + length = tf.identity(length) + observ = self._observ_filter.transform(observ) + reward = self._reward_filter.transform(reward) + policy_summary = self._update_policy( + observ, action, old_mean, old_logstd, reward, length) + with tf.control_dependencies([policy_summary]): + value_summary = self._update_value(observ, reward, length) + with tf.control_dependencies([value_summary]): + penalty_summary = self._adjust_penalty( + observ, old_mean, old_logstd, length) + with tf.control_dependencies([penalty_summary]): + clear_memory = tf.group( + self._memory.clear(), self._memory_index.assign(0)) + with tf.control_dependencies([clear_memory]): + weight_summary = utility.variable_summaries( + tf.trainable_variables(), self._config.weight_summaries) + return tf.summary.merge([ + policy_summary, value_summary, penalty_summary, weight_summary]) + + def _update_value(self, observ, reward, length): + """Perform multiple update steps of the value baseline. + + We need to decide for the summary of one iteration, and thus choose the one + after half of the iterations. + + Args: + observ: Sequences of observations. + reward: Sequences of reward. + length: Batch of sequence lengths. + + Returns: + Summary tensor. + """ + with tf.name_scope('update_value'): + loss, summary = tf.scan( + lambda _1, _2: self._update_value_step(observ, reward, length), + tf.range(self._config.update_epochs_value), + [0., ''], parallel_iterations=1) + print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ') + with tf.control_dependencies([loss, print_loss]): + return summary[self._config.update_epochs_value // 2] + + def _update_value_step(self, observ, reward, length): + """Compute the current value loss and perform a gradient update step. + + Args: + observ: Sequences of observations. + reward: Sequences of reward. + length: Batch of sequence lengths. + + Returns: + Tuple of loss tensor and summary tensor. + """ + loss, summary = self._value_loss(observ, reward, length) + gradients, variables = ( + zip(*self._value_optimizer.compute_gradients(loss))) + optimize = self._value_optimizer.apply_gradients( + zip(gradients, variables)) + summary = tf.summary.merge([ + summary, + tf.summary.scalar('gradient_norm', tf.global_norm(gradients)), + utility.gradient_summaries( + zip(gradients, variables), dict(value=r'.*'))]) + with tf.control_dependencies([optimize]): + return [tf.identity(loss), tf.identity(summary)] + + def _value_loss(self, observ, reward, length): + """Compute the loss function for the value baseline. + + The value loss is the difference between empirical and approximated returns + over the collected episodes. Returns the loss tensor and a summary strin. + + Args: + observ: Sequences of observations. + reward: Sequences of reward. + length: Batch of sequence lengths. + + Returns: + Tuple of loss tensor and summary tensor. 
+ """ + with tf.name_scope('value_loss'): + value = self._network(observ, length).value + return_ = utility.discounted_return( + reward, length, self._config.discount) + advantage = return_ - value + value_loss = 0.5 * self._mask(advantage ** 2, length) + summary = tf.summary.merge([ + tf.summary.histogram('value_loss', value_loss), + tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))]) + value_loss = tf.reduce_mean(value_loss) + return tf.check_numerics(value_loss, 'value_loss'), summary + + def _update_policy( + self, observ, action, old_mean, old_logstd, reward, length): + """Perform multiple update steps of the policy. + + The advantage is computed once at the beginning and shared across + iterations. We need to decide for the summary of one iteration, and thus + choose the one after half of the iterations. + + Args: + observ: Sequences of observations. + action: Sequences of actions. + old_mean: Sequences of action means of the behavioral policy. + old_logstd: Sequences of action log stddevs of the behavioral policy. + reward: Sequences of rewards. + length: Batch of sequence lengths. + + Returns: + Summary tensor. + """ + with tf.name_scope('update_policy'): + return_ = utility.discounted_return( + reward, length, self._config.discount) + value = self._network(observ, length).value + if self._config.gae_lambda: + advantage = utility.lambda_return( + reward, value, length, self._config.discount, + self._config.gae_lambda) + else: + advantage = return_ - value + mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True) + advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8) + advantage = tf.Print( + advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)], + 'return and value: ') + advantage = tf.Print( + advantage, [tf.reduce_mean(advantage)], + 'normalized advantage: ') + # pylint: disable=g-long-lambda + loss, summary = tf.scan( + lambda _1, _2: self._update_policy_step( + observ, action, old_mean, old_logstd, advantage, length), + tf.range(self._config.update_epochs_policy), + [0., ''], parallel_iterations=1) + print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'policy loss: ') + with tf.control_dependencies([loss, print_loss]): + return summary[self._config.update_epochs_policy // 2] + + def _update_policy_step( + self, observ, action, old_mean, old_logstd, advantage, length): + """Compute the current policy loss and perform a gradient update step. + + Args: + observ: Sequences of observations. + action: Sequences of actions. + old_mean: Sequences of action means of the behavioral policy. + old_logstd: Sequences of action log stddevs of the behavioral policy. + advantage: Sequences of advantages. + length: Batch of sequence lengths. + + Returns: + Tuple of loss tensor and summary tensor. + """ + network = self._network(observ, length) + loss, summary = self._policy_loss( + network.mean, network.logstd, old_mean, old_logstd, action, + advantage, length) + gradients, variables = ( + zip(*self._policy_optimizer.compute_gradients(loss))) + optimize = self._policy_optimizer.apply_gradients( + zip(gradients, variables)) + summary = tf.summary.merge([ + summary, + tf.summary.scalar('gradient_norm', tf.global_norm(gradients)), + utility.gradient_summaries( + zip(gradients, variables), dict(policy=r'.*'))]) + with tf.control_dependencies([optimize]): + return [tf.identity(loss), tf.identity(summary)] + + def _policy_loss( + self, mean, logstd, old_mean, old_logstd, action, advantage, length): + """Compute the policy loss composed of multiple components. 
+ + 1. The policy gradient loss is importance sampled from the data-collecting + policy at the beginning of training. + 2. The second term is a KL penalty between the policy at the beginning of + training and the current policy. + 3. Additionally, if this KL already changed more than twice the target + amount, we activate a strong penalty discouraging further divergence. + + Args: + mean: Sequences of action means of the current policy. + logstd: Sequences of action log stddevs of the current policy. + old_mean: Sequences of action means of the behavioral policy. + old_logstd: Sequences of action log stddevs of the behavioral policy. + action: Sequences of actions. + advantage: Sequences of advantages. + length: Batch of sequence lengths. + + Returns: + Tuple of loss tensor and summary tensor. + """ + with tf.name_scope('policy_loss'): + entropy = utility.diag_normal_entropy(mean, logstd) + kl = tf.reduce_mean(self._mask(utility.diag_normal_kl( + old_mean, old_logstd, mean, logstd), length), 1) + policy_gradient = tf.exp( + utility.diag_normal_logpdf(mean, logstd, action) - + utility.diag_normal_logpdf(old_mean, old_logstd, action)) + surrogate_loss = -tf.reduce_mean(self._mask( + policy_gradient * tf.stop_gradient(advantage), length), 1) + kl_penalty = self._penalty * kl + cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor + cutoff_count = tf.reduce_sum( + tf.cast(kl > cutoff_threshold, tf.int32)) + with tf.control_dependencies([tf.cond( + cutoff_count > 0, + lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]): + kl_cutoff = ( + self._config.kl_cutoff_coef * + tf.cast(kl > cutoff_threshold, tf.float32) * + (kl - cutoff_threshold) ** 2) + policy_loss = surrogate_loss + kl_penalty + kl_cutoff + summary = tf.summary.merge([ + tf.summary.histogram('entropy', entropy), + tf.summary.histogram('kl', kl), + tf.summary.histogram('surrogate_loss', surrogate_loss), + tf.summary.histogram('kl_penalty', kl_penalty), + tf.summary.histogram('kl_cutoff', kl_cutoff), + tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff), + tf.summary.histogram('policy_loss', policy_loss), + tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)), + tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)), + tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))]) + policy_loss = tf.reduce_mean(policy_loss, 0) + return tf.check_numerics(policy_loss, 'policy_loss'), summary + + def _adjust_penalty(self, observ, old_mean, old_logstd, length): + """Adjust the KL policy between the behavioral and current policy. + + Compute how much the policy actually changed during the multiple + update steps. Adjust the penalty strength for the next training phase if we + overshot or undershot the target divergence too much. + + Args: + observ: Sequences of observations. + old_mean: Sequences of action means of the behavioral policy. + old_logstd: Sequences of action log stddevs of the behavioral policy. + length: Batch of sequence lengths. + + Returns: + Summary tensor. 
+ """ + with tf.name_scope('adjust_penalty'): + network = self._network(observ, length) + assert_change = tf.assert_equal( + tf.reduce_all(tf.equal(network.mean, old_mean)), False, + message='policy should change') + print_penalty = tf.Print(0, [self._penalty], 'current penalty: ') + with tf.control_dependencies([assert_change, print_penalty]): + kl_change = tf.reduce_mean(self._mask(utility.diag_normal_kl( + old_mean, old_logstd, network.mean, network.logstd), length)) + kl_change = tf.Print(kl_change, [kl_change], 'kl change: ') + maybe_increase = tf.cond( + kl_change > 1.3 * self._config.kl_target, + # pylint: disable=g-long-lambda + lambda: tf.Print(self._penalty.assign( + self._penalty * 1.5), [0], 'increase penalty '), + float) + maybe_decrease = tf.cond( + kl_change < 0.7 * self._config.kl_target, + # pylint: disable=g-long-lambda + lambda: tf.Print(self._penalty.assign( + self._penalty / 1.5), [0], 'decrease penalty '), + float) + with tf.control_dependencies([maybe_increase, maybe_decrease]): + return tf.summary.merge([ + tf.summary.scalar('kl_change', kl_change), + tf.summary.scalar('penalty', self._penalty)]) + + def _mask(self, tensor, length): + """Set padding elements of a batch of sequences to zero. + + Useful to then safely sum along the time dimension. + + Args: + tensor: Tensor of sequences. + length: Batch of sequence lengths. + + Returns: + Masked sequences. + """ + with tf.name_scope('mask'): + range_ = tf.range(tensor.shape[1].value) + mask = tf.cast(range_[None, :] < length[:, None], tf.float32) + masked = tensor * mask + return tf.check_numerics(masked, 'masked') + + def _network(self, observ, length=None, state=None, reuse=True): + """Compute the network output for a batched sequence of observations. + + Optionally, the initial state can be specified. The weights should be + reused for all calls, except for the first one. Output is a named tuple + containing the policy as a TensorFlow distribution, the policy mean and log + standard deviation, the approximated state value, and the new recurrent + state. + + Args: + observ: Sequences of observations. + length: Batch of sequence lengths. + state: Batch of initial recurrent states. + reuse: Python boolean whether to reuse previous variables. + + Returns: + NetworkOutput tuple. + """ + with tf.variable_scope('network', reuse=reuse): + observ = tf.convert_to_tensor(observ) + use_gpu = self._config.use_gpu and utility.available_gpus() + with tf.device('/gpu:0' if use_gpu else '/cpu:0'): + observ = tf.check_numerics(observ, 'observ') + cell = self._config.network(self._batch_env.action.shape[1].value) + (mean, logstd, value), state = tf.nn.dynamic_rnn( + cell, observ, length, state, tf.float32, swap_memory=True) + mean = tf.check_numerics(mean, 'mean') + logstd = tf.check_numerics(logstd, 'logstd') + value = tf.check_numerics(value, 'value') + policy = tf.contrib.distributions.MultivariateNormalDiag( + mean, tf.exp(logstd)) + return _NetworkOutput(policy, mean, logstd, value, state) diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/memory.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/memory.py new file mode 100644 index 000000000..81e0f73d7 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/memory.py @@ -0,0 +1,152 @@ +# Copyright 2017 The TensorFlow Agents Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Memory that stores episodes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +class EpisodeMemory(object): + """Memory that stores episodes.""" + + def __init__(self, template, capacity, max_length, scope): + """Create a memory that stores episodes. + + Each transition tuple consists of quantities specified by the template. + These quantities would typically be be observartions, actions, rewards, and + done indicators. + + Args: + template: List of tensors to derive shapes and dtypes of each transition. + capacity: Number of episodes, or rows, hold by the memory. + max_length: Allocated sequence length for the episodes. + scope: Variable scope to use for internal variables. + """ + self._capacity = capacity + self._max_length = max_length + with tf.variable_scope(scope) as scope: + self._scope = scope + self._length = tf.Variable(tf.zeros(capacity, tf.int32), False) + self._buffers = [ + tf.Variable(tf.zeros( + [capacity, max_length] + elem.shape.as_list(), + elem.dtype), False) + for elem in template] + + def length(self, rows=None): + """Tensor holding the current length of episodes. + + Args: + rows: Episodes to select length from, defaults to all. + + Returns: + Batch tensor of sequence lengths. + """ + rows = tf.range(self._capacity) if rows is None else rows + return tf.gather(self._length, rows) + + def append(self, transitions, rows=None): + """Append a batch of transitions to rows of the memory. + + Args: + transitions: Tuple of transition quantities with batch dimension. + rows: Episodes to append to, defaults to all. + + Returns: + Operation. + """ + rows = tf.range(self._capacity) if rows is None else rows + assert rows.shape.ndims == 1 + assert_capacity = tf.assert_less( + rows, self._capacity, + message='capacity exceeded') + with tf.control_dependencies([assert_capacity]): + assert_max_length = tf.assert_less( + tf.gather(self._length, rows), self._max_length, + message='max length exceeded') + append_ops = [] + with tf.control_dependencies([assert_max_length]): + for buffer_, elements in zip(self._buffers, transitions): + timestep = tf.gather(self._length, rows) + indices = tf.stack([rows, timestep], 1) + append_ops.append(tf.scatter_nd_update(buffer_, indices, elements)) + with tf.control_dependencies(append_ops): + episode_mask = tf.reduce_sum(tf.one_hot( + rows, self._capacity, dtype=tf.int32), 0) + return self._length.assign_add(episode_mask) + + def replace(self, episodes, length, rows=None): + """Replace full episodes. + + Args: + episodes: Tuple of transition quantities with batch and time dimensions. + length: Batch of sequence lengths. + rows: Episodes to replace, defaults to all. + + Returns: + Operation. 
+ """ + rows = tf.range(self._capacity) if rows is None else rows + assert rows.shape.ndims == 1 + assert_capacity = tf.assert_less( + rows, self._capacity, message='capacity exceeded') + with tf.control_dependencies([assert_capacity]): + assert_max_length = tf.assert_less_equal( + length, self._max_length, message='max length exceeded') + replace_ops = [] + with tf.control_dependencies([assert_max_length]): + for buffer_, elements in zip(self._buffers, episodes): + replace_op = tf.scatter_update(buffer_, rows, elements) + replace_ops.append(replace_op) + with tf.control_dependencies(replace_ops): + return tf.scatter_update(self._length, rows, length) + + def data(self, rows=None): + """Access a batch of episodes from the memory. + + Padding elements after the length of each episode are unspecified and might + contain old data. + + Args: + rows: Episodes to select, defaults to all. + + Returns: + Tuple containing a tuple of transition quantiries with batch and time + dimensions, and a batch of sequence lengths. + """ + rows = tf.range(self._capacity) if rows is None else rows + assert rows.shape.ndims == 1 + episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers] + length = tf.gather(self._length, rows) + return episode, length + + def clear(self, rows=None): + """Reset episodes in the memory. + + Internally, this only sets their lengths to zero. The memory entries will + be overridden by future calls to append() or replace(). + + Args: + rows: Episodes to clear, defaults to all. + + Returns: + Operation. + """ + rows = tf.range(self._capacity) if rows is None else rows + assert rows.shape.ndims == 1 + return tf.scatter_update(self._length, rows, tf.zeros_like(rows)) diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/normalize.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/normalize.py new file mode 100644 index 000000000..6c4170519 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/normalize.py @@ -0,0 +1,168 @@ +# Copyright 2017 The TensorFlow Agents Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Normalize tensors based on streaming estimates of mean and variance.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +class StreamingNormalize(object): + """Normalize tensors based on streaming estimates of mean and variance.""" + + def __init__( + self, template, center=True, scale=True, clip=10, name='normalize'): + """Normalize tensors based on streaming estimates of mean and variance. + + Centering the value, scaling it by the standard deviation, and clipping + outlier values are optional. + + Args: + template: Example tensor providing shape and dtype of the vaule to track. + center: Python boolean indicating whether to subtract mean from values. + scale: Python boolean indicating whether to scale values by stddev. + clip: If and when to clip normalized values. 
+ name: Parent scope of operations provided by this class. + """ + self._center = center + self._scale = scale + self._clip = clip + self._name = name + with tf.name_scope(name): + self._count = tf.Variable(0, False) + self._mean = tf.Variable(tf.zeros_like(template), False) + self._var_sum = tf.Variable(tf.zeros_like(template), False) + + def transform(self, value): + """Normalize a single or batch tensor. + + Applies the activated transformations in the constructor using current + estimates of mean and variance. + + Args: + value: Batch or single value tensor. + + Returns: + Normalized batch or single value tensor. + """ + with tf.name_scope(self._name + '/transform'): + no_batch_dim = value.shape.ndims == self._mean.shape.ndims + if no_batch_dim: + # Add a batch dimension if necessary. + value = value[None, ...] + if self._center: + value -= self._mean[None, ...] + if self._scale: + # We cannot scale before seeing at least two samples. + value /= tf.cond( + self._count > 1, lambda: self._std() + 1e-8, + lambda: tf.ones_like(self._var_sum))[None] + if self._clip: + value = tf.clip_by_value(value, -self._clip, self._clip) + # Remove batch dimension if necessary. + if no_batch_dim: + value = value[0] + return tf.check_numerics(value, 'value') + + def update(self, value): + """Update the mean and variance estimates. + + Args: + value: Batch or single value tensor. + + Returns: + Summary tensor. + """ + with tf.name_scope(self._name + '/update'): + if value.shape.ndims == self._mean.shape.ndims: + # Add a batch dimension if necessary. + value = value[None, ...] + count = tf.shape(value)[0] + with tf.control_dependencies([self._count.assign_add(count)]): + step = tf.cast(self._count, tf.float32) + mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0) + new_mean = self._mean + mean_delta / step + new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0]) + var_delta = ( + value - self._mean[None, ...]) * (value - new_mean[None, ...]) + new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0) + with tf.control_dependencies([new_mean, new_var_sum]): + update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum) + with tf.control_dependencies(update): + if value.shape.ndims == 1: + value = tf.reduce_mean(value) + return self._summary('value', tf.reduce_mean(value)) + + def reset(self): + """Reset the estimates of mean and variance. + + Resets the full state of this class. + + Returns: + Operation. + """ + with tf.name_scope(self._name + '/reset'): + return tf.group( + self._count.assign(0), + self._mean.assign(tf.zeros_like(self._mean)), + self._var_sum.assign(tf.zeros_like(self._var_sum))) + + def summary(self): + """Summary string of mean and standard deviation. + + Returns: + Summary tensor. + """ + with tf.name_scope(self._name + '/summary'): + mean_summary = tf.cond( + self._count > 0, lambda: self._summary('mean', self._mean), str) + std_summary = tf.cond( + self._count > 1, lambda: self._summary('stddev', self._std()), str) + return tf.summary.merge([mean_summary, std_summary]) + + def _std(self): + """Computes the current estimate of the standard deviation. + + Note that the standard deviation is not defined until at least two samples + were seen. + + Returns: + Tensor of current variance. + """ + variance = tf.cond( + self._count > 1, + lambda: self._var_sum / tf.cast(self._count - 1, tf.float32), + lambda: tf.ones_like(self._var_sum) * float('nan')) + # The epsilon corrects for small negative variance values caused by + # the algorithm. 
It was empirically chosen to work with all environments + # tested. + return tf.sqrt(variance + 1e-4) + + def _summary(self, name, tensor): + """Create a scalar or histogram summary matching the rank of the tensor. + + Args: + name: Name for the summary. + tensor: Tensor to summarize. + + Returns: + Summary tensor. + """ + if tensor.shape.ndims == 0: + return tf.summary.scalar(name, tensor) + else: + return tf.summary.histogram(name, tensor) diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/utility.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/utility.py new file mode 100644 index 000000000..79645b16b --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/ppo/utility.py @@ -0,0 +1,222 @@ +# Copyright 2017 The TensorFlow Agents Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for the PPO algorithm.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import re + +import tensorflow as tf +from tensorflow.python.client import device_lib + + +def create_nested_vars(tensors): + """Create variables matching a nested tuple of tensors. + + Args: + tensors: Nested tuple of list of tensors. + + Returns: + Nested tuple or list of variables. + """ + if isinstance(tensors, (tuple, list)): + return type(tensors)(create_nested_vars(tensor) for tensor in tensors) + return tf.Variable(tensors, False) + + +def reinit_nested_vars(variables, indices=None): + """Reset all variables in a nested tuple to zeros. + + Args: + variables: Nested tuple or list of variaables. + indices: Indices along the first dimension to reset, defaults to all. + + Returns: + Operation. + """ + if isinstance(variables, (tuple, list)): + return tf.group(*[ + reinit_nested_vars(variable, indices) for variable in variables]) + if indices is None: + return variables.assign(tf.zeros_like(variables)) + else: + zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list()) + return tf.scatter_update(variables, indices, zeros) + + +def assign_nested_vars(variables, tensors): + """Assign tensors to matching nested tuple of variables. + + Args: + variables: Nested tuple or list of variables to update. + tensors: Nested tuple or list of tensors to assign. + + Returns: + Operation. 
+ """ + if isinstance(variables, (tuple, list)): + return tf.group(*[ + assign_nested_vars(variable, tensor) + for variable, tensor in zip(variables, tensors)]) + return variables.assign(tensors) + + +def discounted_return(reward, length, discount): + """Discounted Monte-Carlo returns.""" + timestep = tf.range(reward.shape[1].value) + mask = tf.cast(timestep[None, :] < length[:, None], tf.float32) + return_ = tf.reverse(tf.transpose(tf.scan( + lambda agg, cur: cur + discount * agg, + tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]), + tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1]) + return tf.check_numerics(tf.stop_gradient(return_), 'return') + + +def fixed_step_return(reward, value, length, discount, window): + """N-step discounted return.""" + timestep = tf.range(reward.shape[1].value) + mask = tf.cast(timestep[None, :] < length[:, None], tf.float32) + return_ = tf.zeros_like(reward) + for _ in range(window): + return_ += reward + reward = discount * tf.concat( + [reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1) + return_ += discount ** window * tf.concat( + [value[:, window:], tf.zeros_like(value[:, -window:]), 1]) + return tf.check_numerics(tf.stop_gradient(mask * return_), 'return') + + +def lambda_return(reward, value, length, discount, lambda_): + """TD-lambda returns.""" + timestep = tf.range(reward.shape[1].value) + mask = tf.cast(timestep[None, :] < length[:, None], tf.float32) + sequence = mask * reward + discount * value * (1 - lambda_) + discount = mask * discount * lambda_ + sequence = tf.stack([sequence, discount], 2) + return_ = tf.reverse(tf.transpose(tf.scan( + lambda agg, cur: cur[0] + cur[1] * agg, + tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]), + tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1]) + return tf.check_numerics(tf.stop_gradient(return_), 'return') + + +def lambda_advantage(reward, value, length, discount): + """Generalized Advantage Estimation.""" + timestep = tf.range(reward.shape[1].value) + mask = tf.cast(timestep[None, :] < length[:, None], tf.float32) + next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1) + delta = reward + discount * next_value - value + advantage = tf.reverse(tf.transpose(tf.scan( + lambda agg, cur: cur + discount * agg, + tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]), + tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1]) + return tf.check_numerics(tf.stop_gradient(advantage), 'advantage') + + +def diag_normal_kl(mean0, logstd0, mean1, logstd1): + """Epirical KL divergence of two normals with diagonal covariance.""" + logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1 + return 0.5 * ( + tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) + + tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) + + tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) - + mean0.shape[-1].value) + + +def diag_normal_logpdf(mean, logstd, loc): + """Log density of a normal with diagonal covariance.""" + constant = -0.5 * (math.log(2 * math.pi) + logstd) + value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2 + return tf.reduce_sum(constant + value, -1) + + +def diag_normal_entropy(mean, logstd): + """Empirical entropy of a normal with diagonal covariance.""" + constant = mean.shape[-1].value * math.log(2 * math.pi * math.e) + return (constant + tf.reduce_sum(2 * logstd, 1)) / 2 + + +def available_gpus(): + """List of GPU device names detected by TensorFlow.""" + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] + + +def 
gradient_summaries(grad_vars, groups=None, scope='gradients'): + """Create histogram summaries of the gradient. + + Summaries can be grouped via regexes matching variables names. + + Args: + grad_vars: List of (gradient, variable) tuples as returned by optimizers. + groups: Mapping of name to regex for grouping summaries. + scope: Name scope for this operation. + + Returns: + Summary tensor. + """ + groups = groups or {r'all': r'.*'} + grouped = collections.defaultdict(list) + for grad, var in grad_vars: + if grad is None: + continue + for name, pattern in groups.items(): + if re.match(pattern, var.name): + name = re.sub(pattern, name, var.name) + grouped[name].append(grad) + for name in groups: + if name not in grouped: + tf.logging.warn("No variables matching '{}' group.".format(name)) + summaries = [] + for name, grads in grouped.items(): + grads = [tf.reshape(grad, [-1]) for grad in grads] + grads = tf.concat(grads, 0) + summaries.append(tf.summary.histogram(scope + '/' + name, grads)) + return tf.summary.merge(summaries) + + +def variable_summaries(vars_, groups=None, scope='weights'): + """Create histogram summaries for the provided variables. + + Summaries can be grouped via regexes matching variables names. + + Args: + vars_: List of variables to summarize. + groups: Mapping of name to regex for grouping summaries. + scope: Name scope for this operation. + + Returns: + Summary tensor. + """ + groups = groups or {r'all': r'.*'} + grouped = collections.defaultdict(list) + for var in vars_: + for name, pattern in groups.items(): + if re.match(pattern, var.name): + name = re.sub(pattern, name, var.name) + grouped[name].append(var) + for name in groups: + if name not in grouped: + tf.logging.warn("No variables matching '{}' group.".format(name)) + summaries = [] + for name, vars_ in grouped.items(): + vars_ = [tf.reshape(var, [-1]) for var in vars_] + vars_ = tf.concat(vars_, 0) + summaries.append(tf.summary.histogram(scope + '/' + name, vars_)) + return tf.summary.merge(summaries) diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/__init__.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/__init__.py new file mode 100644 index 000000000..05ee82406 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2017 The TensorFlow Agents Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Executable scripts for reinforcement learning.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from . import train +from . import utility +from . import visualize diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/configs.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/configs.py new file mode 100644 index 000000000..669f61827 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/configs.py @@ -0,0 +1,128 @@ +# Copyright 2017 The TensorFlow Agents Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Example configurations using the PPO algorithm.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=unused-variable + +from pybullet_envs.minitaur.agents import ppo +from pybullet_envs.minitaur.agents.scripts import networks + + +def default(): + """Default configuration for PPO.""" + # General + algorithm = ppo.PPOAlgorithm + num_agents = 10 + eval_episodes = 25 + use_gpu = False + # Network + network = networks.ForwardGaussianPolicy + weight_summaries = dict( + all=r'.*', + policy=r'.*/policy/.*', + value=r'.*/value/.*') + policy_layers = 200, 100 + value_layers = 200, 100 + init_mean_factor = 0.05 + init_logstd = -1 + # Optimization + update_every = 25 + policy_optimizer = 'AdamOptimizer' + value_optimizer = 'AdamOptimizer' + update_epochs_policy = 50 + update_epochs_value = 50 + policy_lr = 1e-4 + value_lr = 3e-4 + # Losses + discount = 0.985 + kl_target = 1e-2 + kl_cutoff_factor = 2 + kl_cutoff_coef = 1000 + kl_init_penalty = 1 + return locals() + + +def pendulum(): + """Configuration for the pendulum classic control task.""" + locals().update(default()) + # Environment + env = 'Pendulum-v0' + max_length = 200 + steps = 1e6 # 1M + return locals() + + +def cheetah(): + """Configuration for MuJoCo's half cheetah task.""" + locals().update(default()) + # Environment + env = 'HalfCheetah-v1' + max_length = 1000 + steps = 1e7 # 10M + return locals() + + +def walker(): + """Configuration for MuJoCo's walker task.""" + locals().update(default()) + # Environment + env = 'Walker2d-v1' + max_length = 1000 + steps = 1e7 # 10M + return locals() + + +def reacher(): + """Configuration for MuJoCo's reacher task.""" + locals().update(default()) + # Environment + env = 'Reacher-v1' + max_length = 1000 + steps = 1e7 # 10M + return locals() + + +def hopper(): + """Configuration for MuJoCo's hopper task.""" + locals().update(default()) + # Environment + env = 'Hopper-v1' + max_length = 1000 + steps = 2e7 # 20M + return locals() + + +def ant(): + """Configuration for MuJoCo's ant task.""" + locals().update(default()) + # Environment + env = 'Ant-v1' + max_length = 1000 + steps = 5e7 # 50M + return locals() + + +def humanoid(): + """Configuration for MuJoCo's humanoid task.""" + locals().update(default()) + # Environment + env = 'Humanoid-v1' + max_length = 1000 + steps = 5e7 # 50M + return locals() diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/networks.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/networks.py new file mode 100644 index 000000000..fe46e6b8f --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/networks.py @@ -0,0 +1,167 @@ +# Copyright 2017 The TensorFlow Agents Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Networks for the PPO algorithm defined as recurrent cells.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +_MEAN_WEIGHTS_INITIALIZER = tf.contrib.layers.variance_scaling_initializer( + factor=0.1) +_LOGSTD_INITIALIZER = tf.random_normal_initializer(-1, 1e-10) + +class LinearGaussianPolicy(tf.contrib.rnn.RNNCell): + """Indepent linear network with a tanh at the end for policy and feedforward network for the value. + + The policy network outputs the mean action and the log standard deviation + is learned as indepent parameter vector. + """ + + def __init__(self, + policy_layers, + value_layers, + action_size, + mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER, + logstd_initializer=_LOGSTD_INITIALIZER): + self._policy_layers = policy_layers + self._value_layers = value_layers + self._action_size = action_size + self._mean_weights_initializer = mean_weights_initializer + self._logstd_initializer = logstd_initializer + + @property + def state_size(self): + unused_state_size = 1 + return unused_state_size + + @property + def output_size(self): + return (self._action_size, self._action_size, tf.TensorShape([])) + + def __call__(self, observation, state): + with tf.variable_scope('policy'): + x = tf.contrib.layers.flatten(observation) + mean = tf.contrib.layers.fully_connected( + x, + self._action_size, + tf.tanh, + weights_initializer=self._mean_weights_initializer) + logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32, + self._logstd_initializer) + logstd = tf.tile(logstd[None, ...], + [tf.shape(mean)[0]] + [1] * logstd.shape.ndims) + with tf.variable_scope('value'): + x = tf.contrib.layers.flatten(observation) + for size in self._value_layers: + x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) + value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0] + return (mean, logstd, value), state + + +class ForwardGaussianPolicy(tf.contrib.rnn.RNNCell): + """Independent feed forward networks for policy and value. + + The policy network outputs the mean action and the log standard deviation + is learned as independent parameter vector. 
+ """ + + def __init__( + self, policy_layers, value_layers, action_size, + mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER, + logstd_initializer=_LOGSTD_INITIALIZER): + self._policy_layers = policy_layers + self._value_layers = value_layers + self._action_size = action_size + self._mean_weights_initializer = mean_weights_initializer + self._logstd_initializer = logstd_initializer + + @property + def state_size(self): + unused_state_size = 1 + return unused_state_size + + @property + def output_size(self): + return (self._action_size, self._action_size, tf.TensorShape([])) + + def __call__(self, observation, state): + with tf.variable_scope('policy'): + x = tf.contrib.layers.flatten(observation) + for size in self._policy_layers: + x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) + mean = tf.contrib.layers.fully_connected( + x, self._action_size, tf.tanh, + weights_initializer=self._mean_weights_initializer) + logstd = tf.get_variable( + 'logstd', mean.shape[1:], tf.float32, self._logstd_initializer) + logstd = tf.tile( + logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims) + with tf.variable_scope('value'): + x = tf.contrib.layers.flatten(observation) + for size in self._value_layers: + x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) + value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0] + return (mean, logstd, value), state + + +class RecurrentGaussianPolicy(tf.contrib.rnn.RNNCell): + """Independent recurrent policy and feed forward value networks. + + The policy network outputs the mean action and the log standard deviation + is learned as independent parameter vector. The last policy layer is recurrent + and uses a GRU cell. + """ + + def __init__( + self, policy_layers, value_layers, action_size, + mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER, + logstd_initializer=_LOGSTD_INITIALIZER): + self._policy_layers = policy_layers + self._value_layers = value_layers + self._action_size = action_size + self._mean_weights_initializer = mean_weights_initializer + self._logstd_initializer = logstd_initializer + self._cell = tf.contrib.rnn.GRUBlockCell(100) + + @property + def state_size(self): + return self._cell.state_size + + @property + def output_size(self): + return (self._action_size, self._action_size, tf.TensorShape([])) + + def __call__(self, observation, state): + with tf.variable_scope('policy'): + x = tf.contrib.layers.flatten(observation) + for size in self._policy_layers[:-1]: + x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) + x, state = self._cell(x, state) + mean = tf.contrib.layers.fully_connected( + x, self._action_size, tf.tanh, + weights_initializer=self._mean_weights_initializer) + logstd = tf.get_variable( + 'logstd', mean.shape[1:], tf.float32, self._logstd_initializer) + logstd = tf.tile( + logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims) + with tf.variable_scope('value'): + x = tf.contrib.layers.flatten(observation) + for size in self._value_layers: + x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) + value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0] + return (mean, logstd, value), state diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/train.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/train.py new file mode 100644 index 000000000..e7048c2d9 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/train.py @@ -0,0 +1,165 @@ +# Copyright 2017 The TensorFlow Agents Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Script to train a batch reinforcement learning algorithm. + +Command line: + + python3 -m agents.scripts.train --logdir=/path/to/logdir --config=pendulum +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import datetime +import functools +import os + +import gym +import tensorflow as tf + +from pybullet_envs.minitaur.agents import tools +from pybullet_envs.minitaur.agents.scripts import configs +from pybullet_envs.minitaur.agents.scripts import utility + + +def _create_environment(config): + """Constructor for an instance of the environment. + + Args: + config: Object providing configurations via attributes. + + Returns: + Wrapped OpenAI Gym environment. + """ + if isinstance(config.env, str): + env = gym.make(config.env) + else: + env = config.env() + if config.max_length: + env = tools.wrappers.LimitDuration(env, config.max_length) + env = tools.wrappers.RangeNormalize(env) + env = tools.wrappers.ClipAction(env) + env = tools.wrappers.ConvertTo32Bit(env) + return env + + +def _define_loop(graph, logdir, train_steps, eval_steps): + """Create and configure a training loop with training and evaluation phases. + + Args: + graph: Object providing graph elements via attributes. + logdir: Log directory for storing checkpoints and summaries. + train_steps: Number of training steps per epoch. + eval_steps: Number of evaluation steps per epoch. + + Returns: + Loop object. + """ + loop = tools.Loop( + logdir, graph.step, graph.should_log, graph.do_report, + graph.force_reset) + loop.add_phase( + 'train', graph.done, graph.score, graph.summary, train_steps, + report_every=None, + log_every=train_steps // 2, + checkpoint_every=None, + feed={graph.is_training: True}) + loop.add_phase( + 'eval', graph.done, graph.score, graph.summary, eval_steps, + report_every=eval_steps, + log_every=eval_steps // 2, + checkpoint_every=10 * eval_steps, + feed={graph.is_training: False}) + return loop + + +def train(config, env_processes): + """Training and evaluation entry point yielding scores. + + Resolves some configuration attributes, creates environments, graph, and + training loop. By default, assigns all operations to the CPU. + + Args: + config: Object providing configurations via attributes. + env_processes: Whether to step environments in separate processes. + + Yields: + Evaluation scores. 
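+
+  A minimal usage sketch, mirroring main() below (the config object is assumed
+  to come from utility.load_config or utility.save_config):
+
+    for score in train(config, env_processes=True):
+      tf.logging.info('Score {}.'.format(score))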
+ """ + tf.reset_default_graph() + with config.unlocked: + config.network = functools.partial( + utility.define_network, config.network, config) + config.policy_optimizer = getattr(tf.train, config.policy_optimizer) + config.value_optimizer = getattr(tf.train, config.value_optimizer) + if config.update_every % config.num_agents: + tf.logging.warn('Number of agents should divide episodes per update.') + with tf.device('/cpu:0'): + batch_env = utility.define_batch_env( + lambda: _create_environment(config), + config.num_agents, env_processes) + graph = utility.define_simulation_graph( + batch_env, config.algorithm, config) + loop = _define_loop( + graph, config.logdir, + config.update_every * config.max_length, + config.eval_episodes * config.max_length) + total_steps = int( + config.steps / config.update_every * + (config.update_every + config.eval_episodes)) + # Exclude episode related variables since the Python state of environments is + # not checkpointed and thus new episodes start after resuming. + saver = utility.define_saver(exclude=(r'.*_temporary/.*',)) + sess_config = tf.ConfigProto(allow_soft_placement=True) + sess_config.gpu_options.allow_growth = True + with tf.Session(config=sess_config) as sess: + utility.initialize_variables(sess, saver, config.logdir) + for score in loop.run(sess, saver, total_steps): + yield score + batch_env.close() + + +def main(_): + """Create or load configuration and launch the trainer.""" + utility.set_up_logging() + if not FLAGS.config: + raise KeyError('You must specify a configuration.') + logdir = FLAGS.logdir and os.path.expanduser(os.path.join( + FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config))) + try: + config = utility.load_config(logdir) + except IOError: + config = tools.AttrDict(getattr(configs, FLAGS.config)()) + config = utility.save_config(config, logdir) + for score in train(config, FLAGS.env_processes): + tf.logging.info('Score {}.'.format(score)) + + +if __name__ == '__main__': + FLAGS = tf.app.flags.FLAGS + tf.app.flags.DEFINE_string( + 'logdir', None, + 'Base directory to store logs.') + tf.app.flags.DEFINE_string( + 'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'), + 'Sub directory to store logs.') + tf.app.flags.DEFINE_string( + 'config', None, + 'Configuration to execute.') + tf.app.flags.DEFINE_boolean( + 'env_processes', True, + 'Step environments in separate processes to circumvent the GIL.') + tf.app.run() diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/train_ppo_test.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/train_ppo_test.py new file mode 100644 index 000000000..df5095825 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/train_ppo_test.py @@ -0,0 +1,110 @@ +# Copyright 2017 The TensorFlow Agents Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for the PPO algorithm usage example.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import itertools + +import tensorflow as tf + +from google3.robotics.reinforcement_learning.agents import ppo +from google3.robotics.reinforcement_learning.agents import tools +from google3.robotics.reinforcement_learning.agents.scripts import configs +from google3.robotics.reinforcement_learning.agents.scripts import networks +from google3.robotics.reinforcement_learning.agents.scripts import train + + +FLAGS = tf.app.flags.FLAGS + + +class PPOTest(tf.test.TestCase): + + def test_no_crash_cheetah(self): + nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy + for network in nets: + config = self._define_config() + with config.unlocked: + config.env = 'HalfCheetah-v1' + config.max_length = 200 + config.steps = 1000 + config.network = network + for score in train.train(config, env_processes=True): + float(score) + + def test_no_crash_ant(self): + nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy + for network in nets: + config = self._define_config() + with config.unlocked: + config.env = 'Ant-v1' + config.max_length = 200 + config.steps = 1000 + config.network = network + for score in train.train(config, env_processes=True): + float(score) + + def test_no_crash_observation_shape(self): + nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy + observ_shapes = (1,), (2, 3), (2, 3, 4) + for network, observ_shape in itertools.product(nets, observ_shapes): + config = self._define_config() + with config.unlocked: + config.env = functools.partial( + tools.MockEnvironment, observ_shape, action_shape=(3,), + min_duration=15, max_duration=15) + config.max_length = 20 + config.steps = 100 + config.network = network + for score in train.train(config, env_processes=False): + float(score) + + def test_no_crash_variable_duration(self): + config = self._define_config() + with config.unlocked: + config.env = functools.partial( + tools.MockEnvironment, observ_shape=(2, 3), action_shape=(3,), + min_duration=5, max_duration=25) + config.max_length = 25 + config.steps = 200 + config.network = networks.RecurrentGaussianPolicy + for score in train.train(config, env_processes=False): + float(score) + + def _define_config(self): + # Start from the example configuration. + locals().update(configs.default()) + # pylint: disable=unused-variable + # General + algorithm = ppo.PPOAlgorithm + num_agents = 2 + update_every = 4 + use_gpu = False + # Network + policy_layers = 20, 10 + value_layers = 20, 10 + # Optimization + update_epochs_policy = 2 + update_epochs_value = 2 + # pylint: enable=unused-variable + return tools.AttrDict(locals()) + + +if __name__ == '__main__': + FLAGS.config = 'unused' + tf.test.main() diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/utility.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/utility.py new file mode 100644 index 000000000..201ea8988 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/utility.py @@ -0,0 +1,213 @@ +# Copyright 2017 The TensorFlow Agents Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for using reinforcement learning algorithms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import logging
+import os
+import re
+
+import ruamel.yaml as yaml
+import tensorflow as tf
+
+from pybullet_envs.minitaur.agents import tools
+
+
+def define_simulation_graph(batch_env, algo_cls, config):
+  """Define the algorithm and environment interaction.
+
+  Args:
+    batch_env: In-graph environments object.
+    algo_cls: Constructor of a batch algorithm.
+    config: Configuration object for the algorithm.
+
+  Returns:
+    Object providing graph elements via attributes.
+  """
+  # pylint: disable=unused-variable
+  step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
+  is_training = tf.placeholder(tf.bool, name='is_training')
+  should_log = tf.placeholder(tf.bool, name='should_log')
+  do_report = tf.placeholder(tf.bool, name='do_report')
+  force_reset = tf.placeholder(tf.bool, name='force_reset')
+  algo = algo_cls(batch_env, step, is_training, should_log, config)
+  done, score, summary = tools.simulate(
+      batch_env, algo, should_log, force_reset)
+  message = 'Graph contains {} trainable variables.'
+  tf.logging.info(message.format(tools.count_weights()))
+  # pylint: enable=unused-variable
+  return tools.AttrDict(locals())
+
+
+def define_batch_env(constructor, num_agents, env_processes):
+  """Create environments and apply all desired wrappers.
+
+  Args:
+    constructor: Constructor of an OpenAI gym environment.
+    num_agents: Number of environments to combine in the batch.
+    env_processes: Whether to step environments in external processes.
+
+  Returns:
+    In-graph environments object.
+  """
+  with tf.variable_scope('environments'):
+    if env_processes:
+      envs = [
+          tools.wrappers.ExternalProcess(constructor)
+          for _ in range(num_agents)]
+    else:
+      envs = [constructor() for _ in range(num_agents)]
+    batch_env = tools.BatchEnv(envs, blocking=not env_processes)
+    batch_env = tools.InGraphBatchEnv(batch_env)
+  return batch_env
+
+
+def define_saver(exclude=None):
+  """Create a saver for the variables we want to checkpoint.
+
+  Args:
+    exclude: List of regexes to match variable names to exclude.
+
+  Returns:
+    Saver object.
+  """
+  variables = []
+  exclude = exclude or []
+  exclude = [re.compile(regex) for regex in exclude]
+  for variable in tf.global_variables():
+    if any(regex.match(variable.name) for regex in exclude):
+      continue
+    variables.append(variable)
+  saver = tf.train.Saver(variables, keep_checkpoint_every_n_hours=5)
+  return saver
+
+
+def define_network(constructor, config, action_size):
+  """Constructor for the recurrent cell for the algorithm.
+
+  Args:
+    constructor: Callable returning the network as RNNCell.
+    config: Object providing configurations via attributes.
+    action_size: Integer indicating the number of action dimensions.
+
+  Returns:
+    Created recurrent cell object.
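+
+  A minimal sketch of a typical call (assumes networks.ForwardGaussianPolicy
+  from the sibling networks module is importable here; the attribute values
+  and action size are illustrative):
+
+    config = tools.AttrDict(dict(
+        init_mean_factor=0.1, init_logstd=-1.0,
+        policy_layers=(200, 100), value_layers=(200, 100)))
+    network = define_network(networks.ForwardGaussianPolicy, config,
+                             action_size=8)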
+ """ + mean_weights_initializer = ( + tf.contrib.layers.variance_scaling_initializer( + factor=config.init_mean_factor)) + logstd_initializer = tf.random_normal_initializer( + config.init_logstd, 1e-10) + network = constructor( + config.policy_layers, config.value_layers, action_size, + mean_weights_initializer=mean_weights_initializer, + logstd_initializer=logstd_initializer) + return network + + +def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None): + """Initialize or restore variables from a checkpoint if available. + + Args: + sess: Session to initialize variables in. + saver: Saver to restore variables. + logdir: Directory to search for checkpoints. + checkpoint: Specify what checkpoint name to use; defaults to most recent. + resume: Whether to expect recovering a checkpoint or starting a new run. + + Raises: + ValueError: If resume expected but no log directory specified. + RuntimeError: If no resume expected but a checkpoint was found. + """ + sess.run(tf.group( + tf.local_variables_initializer(), + tf.global_variables_initializer())) + if resume and not (logdir or checkpoint): + raise ValueError('Need to specify logdir to resume a checkpoint.') + if logdir: + state = tf.train.get_checkpoint_state(logdir) + if checkpoint: + checkpoint = os.path.join(logdir, checkpoint) + if not checkpoint and state and state.model_checkpoint_path: + checkpoint = state.model_checkpoint_path + if checkpoint and resume is False: + message = 'Found unexpected checkpoint when starting a new run.' + raise RuntimeError(message) + if checkpoint: + saver.restore(sess, checkpoint) + + +def save_config(config, logdir=None): + """Save a new configuration by name. + + If a logging directory is specified, is will be created and the configuration + will be stored there. Otherwise, a log message will be printed. + + Args: + config: Configuration object. + logdir: Location for writing summaries and checkpoints if specified. + + Returns: + Configuration object. + """ + if logdir: + with config.unlocked: + config.logdir = logdir + message = 'Start a new run and write summaries and checkpoints to {}.' + tf.logging.info(message.format(config.logdir)) + tf.gfile.MakeDirs(config.logdir) + config_path = os.path.join(config.logdir, 'config.yaml') + with tf.gfile.FastGFile(config_path, 'w') as file_: + yaml.dump(config, file_, default_flow_style=False) + else: + message = ( + 'Start a new run without storing summaries and checkpoints since no ' + 'logging directory was specified.') + tf.logging.info(message) + return config + + +def load_config(logdir): + """Load a configuration from the log directory. + + Args: + logdir: The logging directory containing the configuration file. + + Raises: + IOError: The logging directory does not contain a configuration file. + + Returns: + Configuration object. + """ + config_path = logdir and os.path.join(logdir, 'config.yaml') + if not config_path or not tf.gfile.Exists(config_path): + message = ( + 'Cannot resume an existing run since the logging directory does not ' + 'contain a configuration file.') + raise IOError(message) + with tf.gfile.FastGFile(config_path, 'r') as file_: + config = yaml.load(file_) + message = 'Resume run and write summaries and checkpoints to {}.' 
+  tf.logging.info(message.format(config.logdir))
+  return config
+
+
+def set_up_logging():
+  """Configure the TensorFlow logger."""
+  tf.logging.set_verbosity(tf.logging.INFO)
+  logging.getLogger('tensorflow').propagate = False
diff --git a/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/visualize.py b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/visualize.py
new file mode 100644
index 000000000..3a5f62e4b
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/minitaur/agents/scripts/visualize.py
@@ -0,0 +1,157 @@
+# Copyright 2017 The TensorFlow Agents Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Script to render videos of the Proximal Policy Optimization algorithm.
+
+Command line:
+
+  python3 -m agents.scripts.visualize \
+      --logdir=/path/to/logdir/