add a temp copy of TF agents (until the API stops changing or configs.py is included)
147  examples/pybullet/gym/pybullet_envs/agents/tools/simulate.py  Normal file
@@ -0,0 +1,147 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""In-graph simulation step of a vectorized algorithm with environments."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from . import streaming_mean


def simulate(batch_env, algo, log=True, reset=False):
  """Simulation step of a vectorized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the environments
  into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """

  def _define_begin_episode(agent_indices):
    """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
    assert agent_indices.shape.ndims == 1
    zero_scores = tf.zeros_like(agent_indices, tf.float32)
    zero_durations = tf.zeros_like(agent_indices)
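    # tf.scatter_update only touches the rows listed in `agent_indices`, so
    # scores and lengths of episodes that keep running are left untouched.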
    reset_ops = [
        batch_env.reset(agent_indices),
        tf.scatter_update(score, agent_indices, zero_scores),
        tf.scatter_update(length, agent_indices, zero_durations)]
    with tf.control_dependencies(reset_ops):
      return algo.begin_episode(agent_indices)

  def _define_step():
    """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
    prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
    agent_indices = tf.range(len(batch_env))
    action, step_summary = algo.perform(agent_indices, prevob)
    action.set_shape(batch_env.action.shape)
    with tf.control_dependencies([batch_env.simulate(action)]):
      add_score = score.assign_add(batch_env.reward)
      inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
    with tf.control_dependencies([add_score, inc_length]):
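      # Every op created in this block, including the fresh index tensor,
      # runs only after the score and length updates above have completed.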
      agent_indices = tf.range(len(batch_env))
      experience_summary = algo.experience(
          agent_indices, prevob, batch_env.action, batch_env.reward,
          batch_env.done, batch_env.observ)
    return tf.summary.merge([step_summary, experience_summary])

  def _define_end_episode(agent_indices):
    """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
    assert agent_indices.shape.ndims == 1
    submit_score = mean_score.submit(tf.gather(score, agent_indices))
    submit_length = mean_length.submit(
        tf.cast(tf.gather(length, agent_indices), tf.float32))
    with tf.control_dependencies([submit_score, submit_length]):
      return algo.end_episode(agent_indices)

  def _define_summaries():
    """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
    score_summary = tf.cond(
        tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
        lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
    length_summary = tf.cond(
        tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
        lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
    return tf.summary.merge([score_summary, length_summary])

  with tf.name_scope('simulate'):
    log = tf.convert_to_tensor(log)
    reset = tf.convert_to_tensor(reset)
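    # Non-trainable variables accumulate each environment's return and
    # episode length across invocations of the resulting simulation op.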
    with tf.variable_scope('simulate_temporary'):
      score = tf.Variable(
          tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
      length = tf.Variable(
          tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
    mean_score = streaming_mean.StreamingMean((), tf.float32)
    mean_length = streaming_mean.StreamingMean((), tf.float32)
    agent_indices = tf.cond(
        reset,
        lambda: tf.range(len(batch_env)),
        lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
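    # Build the begin/end episode ops behind conds so they only run when at
    # least one environment is affected; `str` again yields an empty summary.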
    begin_episode = tf.cond(
        tf.cast(tf.shape(agent_indices)[0], tf.bool),
        lambda: _define_begin_episode(agent_indices), str)
    with tf.control_dependencies([begin_episode]):
      step = _define_step()
    with tf.control_dependencies([step]):
      agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
      end_episode = tf.cond(
          tf.cast(tf.shape(agent_indices)[0], tf.bool),
          lambda: _define_end_episode(agent_indices), str)
    with tf.control_dependencies([end_episode]):
      summary = tf.summary.merge([
          _define_summaries(), begin_episode, step, end_episode])
    with tf.control_dependencies([summary]):
      done, score = tf.identity(batch_env.done), tf.identity(score)
    return done, score, summary
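For orientation, a minimal sketch of how the op returned by simulate() might be driven from a TF1-style session loop. BatchEnv and SomeAlgorithm are hypothetical stand-ins for the in-graph batch environment and algorithm objects this module expects; the real wiring lives elsewhere in the agents package.

import tensorflow as tf

# Hypothetical stand-ins: any objects providing the interface simulate() uses
# (reset, simulate, observ, action, reward, done, perform, experience,
# begin_episode, end_episode) would do.
batch_env = BatchEnv(...)
algo = SomeAlgorithm(batch_env)

done, score, summary = simulate(batch_env, algo, log=True, reset=False)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  writer = tf.summary.FileWriter('/tmp/simulate_demo', sess.graph)
  for step in range(1000):
    # Each run advances every environment in the batch by one step.
    _, _, summary_str = sess.run([done, score, summary])
    if summary_str:  # An empty string means nothing was logged this step.
      writer.add_summary(summary_str, step)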