add a temp copy of TF agents (until the API stops changing or configs.py is included)
147  examples/pybullet/gym/pybullet_envs/agents/tools/simulate.py  Normal file
@@ -0,0 +1,147 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""In-graph simulation step of a vectorized algorithm with environments."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from . import streaming_mean


def simulate(batch_env, algo, log=True, reset=False):
  """Simulation step of a vectorized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the environments
  into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """

  def _define_begin_episode(agent_indices):
    """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
    assert agent_indices.shape.ndims == 1
    zero_scores = tf.zeros_like(agent_indices, tf.float32)
    zero_durations = tf.zeros_like(agent_indices)
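    # tf.scatter_update only touches the rows listed in `agent_indices`, so
    # scores and lengths of episodes that keep running are left untouched.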
    reset_ops = [
        batch_env.reset(agent_indices),
        tf.scatter_update(score, agent_indices, zero_scores),
        tf.scatter_update(length, agent_indices, zero_durations)]
    with tf.control_dependencies(reset_ops):
      return algo.begin_episode(agent_indices)

  def _define_step():
    """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
    prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
    agent_indices = tf.range(len(batch_env))
    action, step_summary = algo.perform(agent_indices, prevob)
    action.set_shape(batch_env.action.shape)
    with tf.control_dependencies([batch_env.simulate(action)]):
      add_score = score.assign_add(batch_env.reward)
      inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
    with tf.control_dependencies([add_score, inc_length]):
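      # Every op created in this block, including the fresh index tensor,
      # runs only after the score and length updates above have completed.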
      agent_indices = tf.range(len(batch_env))
      experience_summary = algo.experience(
          agent_indices, prevob, batch_env.action, batch_env.reward,
          batch_env.done, batch_env.observ)
    return tf.summary.merge([step_summary, experience_summary])

  def _define_end_episode(agent_indices):
    """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
    assert agent_indices.shape.ndims == 1
    submit_score = mean_score.submit(tf.gather(score, agent_indices))
    submit_length = mean_length.submit(
        tf.cast(tf.gather(length, agent_indices), tf.float32))
    with tf.control_dependencies([submit_score, submit_length]):
      return algo.end_episode(agent_indices)

  def _define_summaries():
    """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
    score_summary = tf.cond(
        tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
        lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
    length_summary = tf.cond(
        tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
        lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
    return tf.summary.merge([score_summary, length_summary])

  with tf.name_scope('simulate'):
    log = tf.convert_to_tensor(log)
    reset = tf.convert_to_tensor(reset)
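    # Non-trainable variables accumulate each environment's return and
    # episode length across invocations of the resulting simulation op.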
    with tf.variable_scope('simulate_temporary'):
      score = tf.Variable(
          tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
      length = tf.Variable(
          tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
    mean_score = streaming_mean.StreamingMean((), tf.float32)
    mean_length = streaming_mean.StreamingMean((), tf.float32)
    agent_indices = tf.cond(
        reset,
        lambda: tf.range(len(batch_env)),
        lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
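    # Build the begin/end episode ops behind conds so they only run when at
    # least one environment is affected; `str` again yields an empty summary.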
    begin_episode = tf.cond(
        tf.cast(tf.shape(agent_indices)[0], tf.bool),
        lambda: _define_begin_episode(agent_indices), str)
    with tf.control_dependencies([begin_episode]):
      step = _define_step()
    with tf.control_dependencies([step]):
      agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
      end_episode = tf.cond(
          tf.cast(tf.shape(agent_indices)[0], tf.bool),
          lambda: _define_end_episode(agent_indices), str)
    with tf.control_dependencies([end_episode]):
      summary = tf.summary.merge([
          _define_summaries(), begin_episode, step, end_episode])
    with tf.control_dependencies([summary]):
      done, score = tf.identity(batch_env.done), tf.identity(score)
    return done, score, summary
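For orientation, a minimal sketch of how the op returned by simulate() might be driven from a TF1-style session loop. BatchEnv and SomeAlgorithm are hypothetical stand-ins for the in-graph batch environment and algorithm objects this module expects; the real wiring lives elsewhere in the agents package.

import tensorflow as tf

# Hypothetical stand-ins: any objects providing the interface simulate() uses
# (reset, simulate, observ, action, reward, done, perform, experience,
# begin_episode, end_episode) would do.
batch_env = BatchEnv(...)
algo = SomeAlgorithm(batch_env)

done, score, summary = simulate(batch_env, algo, log=True, reset=False)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  writer = tf.summary.FileWriter('/tmp/simulate_demo', sess.graph)
  for step in range(1000):
    # Each run advances every environment in the batch by one step.
    _, _, summary_str = sess.run([done, score, summary])
    if summary_str:  # An empty string means nothing was logged this step.
      writer.add_summary(summary_str, step)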