add a temp copy of TF agents (until the API stops changing or configs.py is included)

Erwin Coumans
2017-11-16 16:47:14 +00:00
parent 7f654bdd87
commit 7b030426c1
24 changed files with 3294 additions and 27 deletions

View File

@@ -0,0 +1,31 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tools for reinforcement learning."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import wrappers
from .attr_dict import AttrDict
from .batch_env import BatchEnv
from .count_weights import count_weights
from .in_graph_batch_env import InGraphBatchEnv
from .in_graph_env import InGraphEnv
from .loop import Loop
from .mock_algorithm import MockAlgorithm
from .mock_environment import MockEnvironment
from .simulate import simulate
from .streaming_mean import StreamingMean

View File

@@ -0,0 +1,54 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrap a dictionary to access keys as attributes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
class AttrDict(dict):
"""Wrap a dictionary to access keys as attributes."""
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
super(AttrDict, self).__setattr__('_mutable', False)
def __getattr__(self, key):
# Do not provide None for unimplemented magic attributes.
if key.startswith('__'):
raise AttributeError
return self.get(key, None)
def __setattr__(self, key, value):
if not self._mutable:
message = "Cannot set attribute '{}'.".format(key)
message += " Use 'with obj.unlocked:' scope to set attributes."
raise RuntimeError(message)
if key.startswith('__'):
raise AttributeError("Cannot set magic attribute '{}'".format(key))
self[key] = value
@property
@contextlib.contextmanager
def unlocked(self):
super(AttrDict, self).__setattr__('_mutable', True)
yield
super(AttrDict, self).__setattr__('_mutable', False)
def copy(self):
return type(self)(super(AttrDict, self).copy())

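A minimal usage sketch of AttrDict (illustrative only; the config name and values are made up):

config = AttrDict(learning_rate=1e-3)
config.learning_rate    # 0.001, keys read as attributes
config.missing          # None, absent keys return None
with config.unlocked:   # attributes may only be set inside this scope
  config.batch_size = 32
config.batch_size = 64  # raises RuntimeError outside the unlocked scope
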
View File

@@ -0,0 +1,124 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Combine multiple environments to step them in batch."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class BatchEnv(object):
"""Combine multiple environments to step them in batch."""
def __init__(self, envs, blocking):
"""Combine multiple environments to step them in batch.
To step environments in parallel, environments must support a
`blocking=False` argument to their step and reset functions that makes them
return callables instead to receive the result at a later time.
Args:
envs: List of environments.
blocking: Step environments one after another rather than in parallel.
Raises:
ValueError: Environments have different observation or action spaces.
"""
self._envs = envs
self._blocking = blocking
observ_space = self._envs[0].observation_space
if not all(env.observation_space == observ_space for env in self._envs):
raise ValueError('All environments must use the same observation space.')
action_space = self._envs[0].action_space
if not all(env.action_space == action_space for env in self._envs):
raise ValueError('All environments must use the same action space.')
def __len__(self):
"""Number of combined environments."""
return len(self._envs)
def __getitem__(self, index):
"""Access an underlying environment by index."""
return self._envs[index]
def __getattr__(self, name):
"""Forward unimplemented attributes to one of the original environments.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in one of the wrapped environments.
"""
return getattr(self._envs[0], name)
def step(self, actions):
"""Forward a batch of actions to the wrapped environments.
Args:
actions: Batched action to apply to the environment.
Raises:
ValueError: Invalid actions.
Returns:
Batch of observations, rewards, and done flags.
"""
for index, (env, action) in enumerate(zip(self._envs, actions)):
if not env.action_space.contains(action):
message = 'Invalid action at index {}: {}'
raise ValueError(message.format(index, action))
if self._blocking:
transitions = [
env.step(action)
for env, action in zip(self._envs, actions)]
else:
transitions = [
env.step(action, blocking=False)
for env, action in zip(self._envs, actions)]
transitions = [transition() for transition in transitions]
observs, rewards, dones, infos = zip(*transitions)
observ = np.stack(observs)
reward = np.stack(rewards)
done = np.stack(dones)
info = tuple(infos)
return observ, reward, done, info
def reset(self, indices=None):
"""Reset the environment and convert the resulting observation.
Args:
indices: The batch indices of environments to reset; defaults to all.
Returns:
Batch of observations.
"""
if indices is None:
indices = np.arange(len(self._envs))
if self._blocking:
observs = [self._envs[index].reset() for index in indices]
else:
observs = [self._envs[index].reset(blocking=False) for index in indices]
observs = [observ() for observ in observs]
observ = np.stack(observs)
return observ
def close(self):
"""Send close messages to the external process and join them."""
for env in self._envs:
if hasattr(env, 'close'):
env.close()

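For reference, a sketch of stepping two ordinary Gym environments through BatchEnv (illustrative; 'CartPole-v0' is just an example id, and blocking=True because plain Gym environments do not support the blocking=False protocol):

import gym
envs = [gym.make('CartPole-v0') for _ in range(2)]
batch = BatchEnv(envs, blocking=True)
observ = batch.reset()  # stacked batch of observations, shape (2, ...)
actions = [env.action_space.sample() for env in batch]
observ, reward, done, info = batch.step(actions)
batch.close()
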
View File

@@ -0,0 +1,48 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Count learnable parameters."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import numpy as np
import tensorflow as tf
def count_weights(scope=None, exclude=None, graph=None):
"""Count learnable parameters.
Args:
scope: Restrict the count to a variable scope.
exclude: Regex to match variable names to exclude.
graph: Operate on a graph other than the current default graph.
Returns:
Number of learnable parameters as integer.
"""
if scope:
scope = scope if scope.endswith('/') else scope + '/'
graph = graph or tf.get_default_graph()
vars_ = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
if scope:
vars_ = [var for var in vars_ if var.name.startswith(scope)]
if exclude:
exclude = re.compile(exclude)
vars_ = [var for var in vars_ if not exclude.match(var.name)]
shapes = [var.get_shape().as_list() for var in vars_]
return int(sum(np.prod(shape) for shape in shapes))

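A quick sketch of how count_weights behaves (TF1-style graph code, matching the rest of this commit; the variable names are made up):

import tensorflow as tf
with tf.variable_scope('policy'):
  tf.get_variable('weights', shape=(10, 4))  # 40 parameters
  tf.get_variable('bias', shape=(4,))        # 4 parameters
count_weights()                   # 44, all trainable variables
count_weights(scope='policy')     # 44, restricted to the scope
count_weights(exclude=r'.*bias')  # 40, bias variables excluded by regex
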
View File

@@ -0,0 +1,178 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Batch of environments inside the TensorFlow graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import tensorflow as tf
class InGraphBatchEnv(object):
"""Batch of environments inside the TensorFlow graph.
The batch of environments will be stepped and reset inside of the graph using
a tf.py_func(). The current batch of observations, actions, rewards, and done
flags are held in corresponding variables.
"""
def __init__(self, batch_env):
"""Batch of environments inside the TensorFlow graph.
Args:
batch_env: Batch environment.
"""
self._batch_env = batch_env
observ_shape = self._parse_shape(self._batch_env.observation_space)
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
action_shape = self._parse_shape(self._batch_env.action_space)
action_dtype = self._parse_dtype(self._batch_env.action_space)
with tf.variable_scope('env_temporary'):
self._observ = tf.Variable(
tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
name='observ', trainable=False)
self._action = tf.Variable(
tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
name='action', trainable=False)
self._reward = tf.Variable(
tf.zeros((len(self._batch_env),), tf.float32),
name='reward', trainable=False)
self._done = tf.Variable(
tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
name='done', trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to one of the original environments.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in one of the original environments.
"""
return getattr(self._batch_env, name)
def __len__(self):
"""Number of combined environments."""
return len(self._batch_env)
def __getitem__(self, index):
"""Access an underlying environment by index."""
return self._batch_env[index]
def simulate(self, action):
"""Step the batch of environments.
The results of the step can be accessed from the variables defined below.
Args:
action: Tensor holding the batch of actions to apply.
Returns:
Operation.
"""
with tf.name_scope('environment/simulate'):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
observ, reward, done = tf.py_func(
lambda a: self._batch_env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool], name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
return tf.group(
self._observ.assign(observ),
self._action.assign(action),
self._reward.assign(reward),
self._done.assign(done))
def reset(self, indices=None):
"""Reset the batch of environments.
Args:
indices: The batch indices of the environments to reset; defaults to all.
Returns:
Batch tensor of the new observations.
"""
if indices is None:
indices = tf.range(len(self._batch_env))
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
observ = tf.py_func(
self._batch_env.reset, [indices], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
reward = tf.zeros_like(indices, tf.float32)
done = tf.zeros_like(indices, tf.bool)
with tf.control_dependencies([
tf.scatter_update(self._observ, indices, observ),
tf.scatter_update(self._reward, indices, reward),
tf.scatter_update(self._done, indices, done)]):
return tf.identity(observ)
@property
def observ(self):
"""Access the variable holding the current observation."""
return self._observ
@property
def action(self):
"""Access the variable holding the last recieved action."""
return self._action
@property
def reward(self):
"""Access the variable holding the current reward."""
return self._reward
@property
def done(self):
"""Access the variable indicating whether the episode is done."""
return self._done
def close(self):
"""Send close messages to the external process and join them."""
self._batch_env.close()
def _parse_shape(self, space):
"""Get a tensor shape from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
Shape tuple.
"""
if isinstance(space, gym.spaces.Discrete):
return ()
if isinstance(space, gym.spaces.Box):
return space.shape
raise NotImplementedError()
def _parse_dtype(self, space):
"""Get a tensor dtype from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
TensorFlow data type.
"""
if isinstance(space, gym.spaces.Discrete):
return tf.int32
if isinstance(space, gym.spaces.Box):
return tf.float32
raise NotImplementedError()

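A hedged sketch of driving InGraphBatchEnv from a TF1 session (illustrative; 'Pendulum-v0' is an example id, and the ConvertTo32Bit wrapper from this commit's wrappers module is used because the py_func declares float32 observations and rewards):

import gym
import tensorflow as tf
envs = [ConvertTo32Bit(gym.make('Pendulum-v0')) for _ in range(2)]
batch_env = InGraphBatchEnv(BatchEnv(envs, blocking=True))
action = tf.placeholder(tf.float32, (2, 1))
step_op = batch_env.simulate(action)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(batch_env.reset())                 # resets all environments
  sess.run(step_op, {action: [[0.0], [0.0]]})
  print(sess.run([batch_env.reward, batch_env.done]))
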
View File

@@ -0,0 +1,162 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Put an OpenAI Gym environment into the TensorFlow graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import tensorflow as tf
class InGraphEnv(object):
"""Put an OpenAI Gym environment into the TensorFlow graph.
The environment will be stepped and reset inside of the graph using
tf.py_func(). The current observation, action, reward, and done flag are held
in corresponding variables.
"""
def __init__(self, env):
"""Put an OpenAI Gym environment into the TensorFlow graph.
Args:
env: OpenAI Gym environment.
"""
self._env = env
observ_shape = self._parse_shape(self._env.observation_space)
observ_dtype = self._parse_dtype(self._env.observation_space)
action_shape = self._parse_shape(self._env.action_space)
action_dtype = self._parse_dtype(self._env.action_space)
with tf.name_scope('environment'):
self._observ = tf.Variable(
tf.zeros(observ_shape, observ_dtype), name='observ', trainable=False)
self._action = tf.Variable(
tf.zeros(action_shape, action_dtype), name='action', trainable=False)
self._reward = tf.Variable(
0.0, dtype=tf.float32, name='reward', trainable=False)
self._done = tf.Variable(
True, dtype=tf.bool, name='done', trainable=False)
self._step = tf.Variable(
0, dtype=tf.int32, name='step', trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to the original environment.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in the wrapped environment.
"""
return getattr(self._env, name)
def simulate(self, action):
"""Step the environment.
The result of the step can be accessed from the variables defined below.
Args:
action: Tensor holding the action to apply.
Returns:
Operation.
"""
with tf.name_scope('environment/simulate'):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._env.observation_space)
observ, reward, done = tf.py_func(
lambda a: self._env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool], name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
return tf.group(
self._observ.assign(observ),
self._action.assign(action),
self._reward.assign(reward),
self._done.assign(done),
self._step.assign_add(1))
def reset(self):
"""Reset the environment.
Returns:
Tensor of the current observation.
"""
observ_dtype = self._parse_dtype(self._env.observation_space)
observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
with tf.control_dependencies([
self._observ.assign(observ),
self._reward.assign(0),
self._done.assign(False)]):
return tf.identity(observ)
@property
def observ(self):
"""Access the variable holding the current observation."""
return self._observ
@property
def action(self):
"""Access the variable holding the last recieved action."""
return self._action
@property
def reward(self):
"""Access the variable holding the current reward."""
return self._reward
@property
def done(self):
"""Access the variable indicating whether the episode is done."""
return self._done
@property
def step(self):
"""Access the variable containg total steps of this environment."""
return self._step
def _parse_shape(self, space):
"""Get a tensor shape from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
Shape tuple.
"""
if isinstance(space, gym.spaces.Discrete):
return ()
if isinstance(space, gym.spaces.Box):
return space.shape
raise NotImplementedError()
def _parse_dtype(self, space):
"""Get a tensor dtype from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
TensorFlow data type.
"""
if isinstance(space, gym.spaces.Discrete):
return tf.int32
if isinstance(space, gym.spaces.Box):
return tf.float32
raise NotImplementedError()

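The single-environment variant works the same way; a brief illustrative sketch (again with ConvertTo32Bit providing float32 data, and 'Pendulum-v0' as an example id):

import gym
import tensorflow as tf
env = InGraphEnv(ConvertTo32Bit(gym.make('Pendulum-v0')))
action = tf.placeholder(tf.float32, (1,))
step_op = env.simulate(action)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(env.reset())
  sess.run(step_op, {action: [0.0]})
  print(sess.run([env.reward, env.done, env.step]))  # step counts transitions
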
View File

@@ -0,0 +1,233 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Execute operations in a loop and coordinate logging and checkpoints."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import tensorflow as tf
from . import streaming_mean
_Phase = collections.namedtuple(
'Phase',
'name, writer, op, batch, steps, feed, report_every, log_every,'
'checkpoint_every')
class Loop(object):
"""Execute operations in a loop and coordinate logging and checkpoints.
Supports multiple phases, that define their own operations to run, and
intervals for reporting scores, logging summaries, and storing checkpoints.
All class state is stored in-graph to properly recover from checkpoints.
"""
def __init__(self, logdir, step=None, log=None, report=None, reset=None):
"""Execute operations in a loop and coordinate logging and checkpoints.
The step, log, report, and reset arguments will be created if not
provided. Reset is used to indicate switching to a new phase, so that the
model can start a new computation in case its computation is split over
multiple training steps.
Args:
logdir: Will contain checkpoints and summaries for each phase.
step: Variable of the global step (optional).
log: Tensor indicating to the model to compute summary tensors.
report: Tensor indicating to the loop to report the current mean score.
reset: Tensor indicating to the model to start a new computation.
"""
self._logdir = logdir
self._step = (
tf.Variable(0, False, name='global_step') if step is None else step)
self._log = tf.placeholder(tf.bool) if log is None else log
self._report = tf.placeholder(tf.bool) if report is None else report
self._reset = tf.placeholder(tf.bool) if reset is None else reset
self._phases = []
def add_phase(
self, name, done, score, summary, steps,
report_every=None, log_every=None, checkpoint_every=None, feed=None):
"""Add a phase to the loop protocol.
If the model breaks long computation into multiple steps, the done tensor
indicates whether the current score should be added to the mean counter.
For example, in reinforcement learning we only have a valid score at the
end of the episode.
Score and done tensors can either be scalars or vectors, to support
single and batched computations.
Args:
name: Name for the phase, used for the summary writer.
done: Tensor indicating whether current score can be used.
score: Tensor holding the current, possibly intermediate, score.
summary: Tensor holding summary string to write if not an empty string.
steps: Duration of the phase in steps.
report_every: Yield mean score every this number of steps.
log_every: Request summaries via `log` tensor every this number of steps.
checkpoint_every: Write checkpoint every this number of steps.
feed: Additional feed dictionary for the session run call.
Raises:
ValueError: Unknown rank for done or score tensors.
"""
done = tf.convert_to_tensor(done, tf.bool)
score = tf.convert_to_tensor(score, tf.float32)
summary = tf.convert_to_tensor(summary, tf.string)
feed = feed or {}
if done.shape.ndims is None or score.shape.ndims is None:
raise ValueError("Rank of 'done' and 'score' tensors must be known.")
writer = self._logdir and tf.summary.FileWriter(
os.path.join(self._logdir, name), tf.get_default_graph(),
flush_secs=60)
op = self._define_step(done, score, summary)
batch = 1 if score.shape.ndims == 0 else score.shape[0].value
self._phases.append(_Phase(
name, writer, op, batch, int(steps), feed, report_every,
log_every, checkpoint_every))
def run(self, sess, saver, max_step=None):
"""Run the loop schedule for a specified number of steps.
Call the operation of the current phase until the global step reaches the
specified maximum step. Phases are repeated over and over in the order they
were added.
Args:
sess: Session to use to run the phase operation.
saver: Saver used for checkpointing.
max_step: Run the operations until the step reaches this limit.
Yields:
Reported mean scores.
"""
global_step = sess.run(self._step)
steps_made = 1
while True:
if max_step and global_step >= max_step:
break
phase, epoch, steps_in = self._find_current_phase(global_step)
phase_step = epoch * phase.steps + steps_in
if steps_in % phase.steps < steps_made:
message = '\n' + ('-' * 50) + '\n'
message += 'Phase {} (phase step {}, global step {}).'
tf.logging.info(message.format(phase.name, phase_step, global_step))
# Populate bookkeeping tensors.
phase.feed[self._reset] = (steps_in < steps_made)
phase.feed[self._log] = (
phase.writer and
self._is_every_steps(phase_step, phase.batch, phase.log_every))
phase.feed[self._report] = (
self._is_every_steps(phase_step, phase.batch, phase.report_every))
summary, mean_score, global_step, steps_made = sess.run(
phase.op, phase.feed)
if self._is_every_steps(phase_step, phase.batch, phase.checkpoint_every):
self._store_checkpoint(sess, saver, global_step)
if self._is_every_steps(phase_step, phase.batch, phase.report_every):
yield mean_score
if summary and phase.writer:
# We want smaller phases to catch up at the beginning of each epoch so
# that their graphs are aligned.
longest_phase = max(phase.steps for phase in self._phases)
summary_step = epoch * longest_phase + steps_in
phase.writer.add_summary(summary, summary_step)
def _is_every_steps(self, phase_step, batch, every):
"""Determine whether a periodic event should happen at this step.
Args:
phase_step: The incrementing step.
batch: The number of steps progressed at once.
every: The interval of the period.
Returns:
Boolean of whether the event should happen.
"""
if not every:
return False
covered_steps = range(phase_step, phase_step + batch)
return any((step + 1) % every == 0 for step in covered_steps)
def _find_current_phase(self, global_step):
"""Determine the current phase based on the global step.
This ensures continuing the correct phase after restoring checkpoints.
Args:
global_step: The global number of steps performed across all phases.
Returns:
Tuple of phase object, epoch number, and phase steps within the epoch.
"""
epoch_size = sum(phase.steps for phase in self._phases)
epoch = int(global_step // epoch_size)
steps_in = global_step % epoch_size
for phase in self._phases:
if steps_in < phase.steps:
return phase, epoch, steps_in
steps_in -= phase.steps
def _define_step(self, done, score, summary):
"""Combine operations of a phase.
Keeps track of the mean score and when to report it.
Args:
done: Tensor indicating whether current score can be used.
score: Tensor holding the current, possibly intermediate, score.
summary: Tensor holding summary string to write if not an empty string.
Returns:
Tuple of summary tensor, mean score, and new global step. The mean score
is zero for non-reporting steps.
"""
if done.shape.ndims == 0:
done = done[None]
if score.shape.ndims == 0:
score = score[None]
score_mean = streaming_mean.StreamingMean((), tf.float32)
with tf.control_dependencies([done, score, summary]):
done_score = tf.gather(score, tf.where(done)[:, 0])
submit_score = tf.cond(
tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
with tf.control_dependencies([submit_score]):
mean_score = tf.cond(self._report, score_mean.clear, float)
steps_made = tf.shape(score)[0]
next_step = self._step.assign_add(steps_made)
with tf.control_dependencies([mean_score, next_step]):
return tf.identity(summary), mean_score, next_step, steps_made
def _store_checkpoint(self, sess, saver, global_step):
"""Store a checkpoint if a log directory was provided to the constructor.
The directory will be created if needed.
Args:
sess: Session containing variables to store.
saver: Saver used for checkpointing.
global_step: Step number of the checkpoint name.
"""
if not self._logdir or not saver:
return
tf.gfile.MakeDirs(self._logdir)
filename = os.path.join(self._logdir, 'model.ckpt')
saver.save(sess, filename, global_step)

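A hedged sketch of the phase protocol (the logdir path, phase name, and dummy score tensor are all made up; a real setup would pass the model's training op and summaries):

import tensorflow as tf
loop = Loop(logdir='/tmp/loop_demo')
score = tf.random_uniform(())  # stand-in for a real score tensor
loop.add_phase(
    'train', done=tf.constant(True), score=score, summary='',
    steps=100, report_every=50)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  # Yields the mean score every 50 steps until the global step hits 200.
  for mean_score in loop.run(sess, saver=None, max_step=200):
    print(mean_score)
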
View File

@@ -0,0 +1,49 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock algorithm for testing reinforcement learning code."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class MockAlgorithm(object):
"""Produce random actions and empty summaries."""
def __init__(self, envs):
"""Produce random actions and empty summaries.
Args:
envs: List of in-graph environments.
"""
self._envs = envs
def begin_episode(self, unused_agent_indices):
return tf.constant('')
def perform(self, agent_indices, unused_observ):
shape = (tf.shape(agent_indices)[0],) + self._envs[0].action_space.shape
low = self._envs[0].action_space.low
high = self._envs[0].action_space.high
action = tf.random_uniform(shape) * (high - low) + low
return action, tf.constant('')
def experience(self, unused_agent_indices, *unused_transition):
return tf.constant('')
def end_episode(self, unused_agent_indices):
return tf.constant('')

View File

@@ -0,0 +1,86 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock environment for testing reinforcement learning code."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import gym.spaces
import numpy as np
class MockEnvironment(object):
"""Generate random agent input and keep track of statistics."""
def __init__(self, observ_shape, action_shape, min_duration, max_duration):
"""Generate random agent input and keep track of statistics.
Args:
observ_shape: Shape for the random observations.
action_shape: Shape for the action space.
min_duration: Minimum number of steps per episode.
max_duration: Maximum number of steps per episode.
Attributes:
steps: List of actual simulated lengths for all episodes.
durations: List of decided lengths for all episodes.
"""
self._observ_shape = observ_shape
self._action_shape = action_shape
self._min_duration = min_duration
self._max_duration = max_duration
self._random = np.random.RandomState(0)
self.steps = []
self.durations = []
@property
def observation_space(self):
low = np.zeros(self._observ_shape)
high = np.ones(self._observ_shape)
return gym.spaces.Box(low, high)
@property
def action_space(self):
low = np.zeros(self._action_shape)
high = np.ones(self._action_shape)
return gym.spaces.Box(low, high)
@property
def unwrapped(self):
return self
def step(self, action):
assert self.action_space.contains(action)
assert self.steps[-1] < self.durations[-1]
self.steps[-1] += 1
observ = self._current_observation()
reward = self._current_reward()
done = self.steps[-1] >= self.durations[-1]
info = {}
return observ, reward, done, info
def reset(self):
duration = self._random.randint(self._min_duration, self._max_duration + 1)
self.steps.append(0)
self.durations.append(duration)
return self._current_observation()
def _current_observation(self):
return self._random.uniform(0, 1, self._observ_shape)
def _current_reward(self):
return self._random.uniform(-1, 1)

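Usage is straightforward; a small sketch (shapes and durations are arbitrary):

env = MockEnvironment(observ_shape=(4,), action_shape=(2,),
                      min_duration=2, max_duration=5)
observ = env.reset()
observ, reward, done, info = env.step(env.action_space.sample())
print(env.steps, env.durations)  # per-episode bookkeeping used by tests
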
View File

@@ -0,0 +1,147 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""In-graph simulation step of a vectorized algorithm with environments."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from . import streaming_mean
def simulate(batch_env, algo, log=True, reset=False):
"""Simulation step of a vecrotized algorithm with in-graph environments.
Integrates the operations implemented by the algorithm and the environments
into a combined operation.
Args:
batch_env: In-graph batch environment.
algo: Algorithm instance implementing required operations.
log: Tensor indicating whether to compute and return summaries.
reset: Tensor causing all environments to reset.
Returns:
Tuple of tensors containing done flags for the current episodes, possibly
intermediate scores for the episodes, and a summary tensor.
"""
def _define_begin_episode(agent_indices):
"""Reset environments, intermediate scores and durations for new episodes.
Args:
agent_indices: Tensor containing batch indices starting an episode.
Returns:
Summary tensor.
"""
assert agent_indices.shape.ndims == 1
zero_scores = tf.zeros_like(agent_indices, tf.float32)
zero_durations = tf.zeros_like(agent_indices)
reset_ops = [
batch_env.reset(agent_indices),
tf.scatter_update(score, agent_indices, zero_scores),
tf.scatter_update(length, agent_indices, zero_durations)]
with tf.control_dependencies(reset_ops):
return algo.begin_episode(agent_indices)
def _define_step():
"""Request actions from the algorithm and apply them to the environments.
Increments the lengths of all episodes and increases their scores by the
current reward. After stepping the environments, provides the full
transition tuple to the algorithm.
Returns:
Summary tensor.
"""
prevob = batch_env.observ + 0 # Ensure a copy of the variable value.
agent_indices = tf.range(len(batch_env))
action, step_summary = algo.perform(agent_indices, prevob)
action.set_shape(batch_env.action.shape)
with tf.control_dependencies([batch_env.simulate(action)]):
add_score = score.assign_add(batch_env.reward)
inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
with tf.control_dependencies([add_score, inc_length]):
agent_indices = tf.range(len(batch_env))
experience_summary = algo.experience(
agent_indices, prevob, batch_env.action, batch_env.reward,
batch_env.done, batch_env.observ)
return tf.summary.merge([step_summary, experience_summary])
def _define_end_episode(agent_indices):
"""Notify the algorithm of ending episodes.
Also updates the mean score and length counters used for summaries.
Args:
agent_indices: Tensor holding batch indices that end their episodes.
Returns:
Summary tensor.
"""
assert agent_indices.shape.ndims == 1
submit_score = mean_score.submit(tf.gather(score, agent_indices))
submit_length = mean_length.submit(
tf.cast(tf.gather(length, agent_indices), tf.float32))
with tf.control_dependencies([submit_score, submit_length]):
return algo.end_episode(agent_indices)
def _define_summaries():
"""Reset the average score and duration, and return them as summary.
Returns:
Summary string.
"""
score_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
length_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
return tf.summary.merge([score_summary, length_summary])
with tf.name_scope('simulate'):
log = tf.convert_to_tensor(log)
reset = tf.convert_to_tensor(reset)
with tf.variable_scope('simulate_temporary'):
score = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
length = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
mean_score = streaming_mean.StreamingMean((), tf.float32)
mean_length = streaming_mean.StreamingMean((), tf.float32)
agent_indices = tf.cond(
reset,
lambda: tf.range(len(batch_env)),
lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
begin_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_begin_episode(agent_indices), str)
with tf.control_dependencies([begin_episode]):
step = _define_step()
with tf.control_dependencies([step]):
agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
end_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_end_episode(agent_indices), str)
with tf.control_dependencies([end_episode]):
summary = tf.summary.merge([
_define_summaries(), begin_episode, step, end_episode])
with tf.control_dependencies([summary]):
done, score = tf.identity(batch_env.done), tf.identity(score)
return done, score, summary

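A hedged end-to-end sketch wiring the pieces of this commit together (MockEnvironment and MockAlgorithm from above, ConvertTo32Bit from the wrappers module because the in-graph environment expects float32 data; running done and score also runs the summary via the control dependencies):

import tensorflow as tf
envs = [ConvertTo32Bit(MockEnvironment((2,), (3,), 2, 4))
        for _ in range(2)]
batch_env = InGraphBatchEnv(BatchEnv(envs, blocking=True))
algo = MockAlgorithm(batch_env)
done, score, summary = simulate(batch_env, algo, log=False, reset=False)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run([done, score]))  # one simulation step for both episodes
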
View File

@@ -0,0 +1,67 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute a streaming estimation of the mean of submitted tensors."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class StreamingMean(object):
"""Compute a streaming estimation of the mean of submitted tensors."""
def __init__(self, shape, dtype):
"""Specify the shape and dtype of the mean to be estimated.
Note that the float mean of zero submitted elements is NaN, while computing
the integer mean of zero elements raises a division by zero error.
Args:
shape: Shape of the mean to compute.
dtype: Data type of the mean to compute.
"""
self._dtype = dtype
self._sum = tf.Variable(lambda: tf.zeros(shape, dtype), False)
self._count = tf.Variable(lambda: 0, trainable=False)
@property
def value(self):
"""The current value of the mean."""
return self._sum / tf.cast(self._count, self._dtype)
@property
def count(self):
"""The number of submitted samples."""
return self._count
def submit(self, value):
"""Submit a single or batch tensor to refine the streaming mean."""
# Add a batch dimension if necessary.
if value.shape.ndims == self._sum.shape.ndims:
value = value[None, ...]
return tf.group(
self._sum.assign_add(tf.reduce_sum(value, 0)),
self._count.assign_add(tf.shape(value)[0]))
def clear(self):
"""Return the mean estimate and reset the streaming statistics."""
value = self._sum / tf.cast(self._count, self._dtype)
with tf.control_dependencies([value]):
reset_value = self._sum.assign(tf.zeros_like(self._sum))
reset_count = self._count.assign(0)
with tf.control_dependencies([reset_value, reset_count]):
return tf.identity(value)

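A tiny sketch of the submit/clear cycle (the values are arbitrary):

import tensorflow as tf
mean = StreamingMean((), tf.float32)
submit = mean.submit(tf.constant([1.0, 2.0, 3.0]))
value = mean.value
clear = mean.clear()
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(submit)
  print(sess.run(value))  # 2.0
  print(sess.run(clear))  # 2.0, and the statistics reset afterwards
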
View File

@@ -0,0 +1,558 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrappers for OpenAI Gym environments."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import multiprocessing
import sys
import traceback
import gym
import gym.spaces
import numpy as np
import tensorflow as tf
class AutoReset(object):
"""Automatically reset environment when the episode is done."""
def __init__(self, env):
self._env = env
self._done = True
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
if self._done:
observ, reward, done, info = self._env.reset(), 0.0, False, {}
else:
observ, reward, done, info = self._env.step(action)
self._done = done
return observ, reward, done, info
def reset(self):
self._done = False
return self._env.reset()
class ActionRepeat(object):
"""Repeat the agent action multiple steps."""
def __init__(self, env, amount):
self._env = env
self._amount = amount
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
done = False
total_reward = 0
current_step = 0
while current_step < self._amount and not done:
observ, reward, done, info = self._env.step(action)
total_reward += reward
current_step += 1
return observ, total_reward, done, info
class RandomStart(object):
"""Perform random number of random actions at the start of the episode."""
def __init__(self, env, max_steps):
self._env = env
self._max_steps = max_steps
def __getattr__(self, name):
return getattr(self._env, name)
def reset(self):
observ = self._env.reset()
random_steps = np.random.randint(0, self._max_steps)
for _ in range(random_steps):
action = self._env.action_space.sample()
observ, unused_reward, done, unused_info = self._env.step(action)
if done:
tf.logging.warning('Episode ended during random start.')
return self.reset()
return observ
class FrameHistory(object):
"""Augment the observation with past observations."""
def __init__(self, env, past_indices, flatten):
"""Augment the observation with past observations.
Implemented as a Numpy ring buffer holding the necessary past observations.
Args:
env: OpenAI Gym environment to wrap.
past_indices: List of non-negative integers indicating the time offsets
from the current time step of observations to include.
flatten: Concatenate the past observations rather than stacking them.
Raises:
KeyError: The current observation is not included in the indices.
"""
if 0 not in past_indices:
raise KeyError('Past indices should include 0 for the current frame.')
self._env = env
self._past_indices = past_indices
self._step = 0
self._buffer = None
self._capacity = max(past_indices)
self._flatten = flatten
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
low = self._env.observation_space.low
high = self._env.observation_space.high
low = np.repeat(low[None, ...], len(self._past_indices), 0)
high = np.repeat(high[None, ...], len(self._past_indices), 0)
if self._flatten:
low = np.reshape(low, (-1,) + low.shape[2:])
high = np.reshape(high, (-1,) + high.shape[2:])
return gym.spaces.Box(low, high)
def step(self, action):
observ, reward, done, info = self._env.step(action)
self._step += 1
self._buffer[self._step % self._capacity] = observ
observ = self._select_frames()
return observ, reward, done, info
def reset(self):
observ = self._env.reset()
self._buffer = np.repeat(observ[None, ...], self._capacity, 0)
self._step = 0
return self._select_frames()
def _select_frames(self):
indices = [
(self._step - index) % self._capacity for index in self._past_indices]
observ = self._buffer[indices]
if self._flatten:
observ = np.reshape(observ, (-1,) + observ.shape[2:])
return observ
class FrameDelta(object):
"""Convert the observation to a difference from the previous observation."""
def __init__(self, env):
self._env = env
self._last = None
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
low = self._env.observation_space.low
high = self._env.observation_space.high
low, high = low - high, high - low
return gym.spaces.Box(low, high)
def step(self, action):
observ, reward, done, info = self._env.step(action)
delta = observ - self._last
self._last = observ
return delta, reward, done, info
def reset(self):
observ = self._env.reset()
self._last = observ
return observ
class RangeNormalize(object):
"""Normalize the specialized observation and action ranges to [-1, 1]."""
def __init__(self, env, observ=None, action=None):
self._env = env
self._should_normalize_observ = (
observ is not False and self._is_finite(self._env.observation_space))
if observ is True and not self._should_normalize_observ:
raise ValueError('Cannot normalize infinite observation range.')
if observ is None and not self._should_normalize_observ:
tf.logging.info('Not normalizing infinite observation range.')
self._should_normalize_action = (
action is not False and self._is_finite(self._env.action_space))
if action is True and not self._should_normalize_action:
raise ValueError('Cannot normalize infinite action range.')
if action is None and not self._should_normalize_action:
tf.logging.info('Not normalizing infinite action range.')
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
space = self._env.observation_space
if not self._should_normalize_observ:
return space
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
@property
def action_space(self):
space = self._env.action_space
if not self._should_normalize_action:
return space
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
def step(self, action):
if self._should_normalize_action:
action = self._denormalize_action(action)
observ, reward, done, info = self._env.step(action)
if self._should_normalize_observ:
observ = self._normalize_observ(observ)
return observ, reward, done, info
def reset(self):
observ = self._env.reset()
if self._should_normalize_observ:
observ = self._normalize_observ(observ)
return observ
def _denormalize_action(self, action):
min_ = self._env.action_space.low
max_ = self._env.action_space.high
action = (action + 1) / 2 * (max_ - min_) + min_
return action
def _normalize_observ(self, observ):
min_ = self._env.observation_space.low
max_ = self._env.observation_space.high
observ = 2 * (observ - min_) / (max_ - min_) - 1
return observ
def _is_finite(self, space):
return np.isfinite(space.low).all() and np.isfinite(space.high).all()
class ClipAction(object):
"""Clip out of range actions to the action space of the environment."""
def __init__(self, env):
self._env = env
def __getattr__(self, name):
return getattr(self._env, name)
@property
def action_space(self):
shape = self._env.action_space.shape
return gym.spaces.Box(-np.inf * np.ones(shape), np.inf * np.ones(shape))
def step(self, action):
action_space = self._env.action_space
action = np.clip(action, action_space.low, action_space.high)
return self._env.step(action)
class LimitDuration(object):
"""End episodes after specified number of steps."""
def __init__(self, env, duration):
self._env = env
self._duration = duration
self._step = None
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
if self._step is None:
raise RuntimeError('Must reset environment.')
observ, reward, done, info = self._env.step(action)
self._step += 1
if self._step >= self._duration:
done = True
self._step = None
return observ, reward, done, info
def reset(self):
self._step = 0
return self._env.reset()
class ExternalProcess(object):
"""Step environment in a separate process for lock free paralellism."""
# Message types for communication via the pipe.
_ACCESS = 1
_CALL = 2
_RESULT = 3
_EXCEPTION = 4
_CLOSE = 5
def __init__(self, constructor):
"""Step environment in a separate process for lock free paralellism.
The environment will be created in the external process by calling the
specified callable. This can be an environment class, or a function
creating the environment and potentially wrapping it. The returned
environment should not access global variables.
Args:
constructor: Callable that creates and returns an OpenAI gym environment.
Attributes:
observation_space: The cached observation space of the environment.
action_space: The cached action space of the environment.
"""
self._conn, conn = multiprocessing.Pipe()
self._process = multiprocessing.Process(
target=self._worker, args=(constructor, conn))
atexit.register(self.close)
self._process.start()
self._observ_space = None
self._action_space = None
@property
def observation_space(self):
if not self._observ_space:
self._observ_space = self.__getattr__('observation_space')
return self._observ_space
@property
def action_space(self):
if not self._action_space:
self._action_space = self.__getattr__('action_space')
return self._action_space
def __getattr__(self, name):
"""Request an attribute from the environment.
Note that this involves communication with the external process, so it can
be slow.
Args:
name: Attribute to access.
Returns:
Value of the attribute.
"""
self._conn.send((self._ACCESS, name))
return self._receive()
def call(self, name, *args, **kwargs):
"""Asynchronously call a method of the external environment.
Args:
name: Name of the method to call.
*args: Positional arguments to forward to the method.
**kwargs: Keyword arguments to forward to the method.
Returns:
Promise object that blocks and provides the return value when called.
"""
payload = name, args, kwargs
self._conn.send((self._CALL, payload))
return self._receive
def close(self):
"""Send a close message to the external process and join it."""
try:
self._conn.send((self._CLOSE, None))
self._conn.close()
except IOError:
# The connection was already closed.
pass
self._process.join()
def step(self, action, blocking=True):
"""Step the environment.
Args:
action: The action to apply to the environment.
blocking: Whether to wait for the result.
Returns:
Transition tuple when blocking, otherwise callable that returns the
transition tuple.
"""
promise = self.call('step', action)
if blocking:
return promise()
else:
return promise
def reset(self, blocking=True):
"""Reset the environment.
Args:
blocking: Whether to wait for the result.
Returns:
New observation when blocking, otherwise callable that returns the new
observation.
"""
promise = self.call('reset')
if blocking:
return promise()
else:
return promise
def _receive(self):
"""Wait for a message from the worker process and return its payload.
Raises:
Exception: An exception was raised inside the worker process.
KeyError: The received message is of an unknown type.
Returns:
Payload object of the message.
"""
message, payload = self._conn.recv()
# Re-raise exceptions in the main process.
if message == self._EXCEPTION:
stacktrace = payload
raise Exception(stacktrace)
if message == self._RESULT:
return payload
raise KeyError('Received message of unexpected type {}'.format(message))
def _worker(self, constructor, conn):
"""The process waits for actions and sends back environment results.
Args:
constructor: Constructor for the OpenAI Gym environment.
conn: Connection for communication to the main process.
"""
try:
env = constructor()
while True:
try:
# Only block for short times to have keyboard exceptions be raised.
if not conn.poll(0.1):
continue
message, payload = conn.recv()
except (EOFError, KeyboardInterrupt):
break
if message == self._ACCESS:
name = payload
result = getattr(env, name)
conn.send((self._RESULT, result))
continue
if message == self._CALL:
name, args, kwargs = payload
result = getattr(env, name)(*args, **kwargs)
conn.send((self._RESULT, result))
continue
if message == self._CLOSE:
assert payload is None
break
raise KeyError('Received message of unknown type {}'.format(message))
except Exception: # pylint: disable=broad-except
stacktrace = ''.join(traceback.format_exception(*sys.exc_info()))
tf.logging.error('Error in environment process: {}'.format(stacktrace))
conn.send((self._EXCEPTION, stacktrace))
conn.close()
class ConvertTo32Bit(object):
"""Convert data types of an OpenAI Gym environment to 32 bit."""
def __init__(self, env):
"""Convert data types of an OpenAI Gym environment to 32 bit.
Args:
env: OpenAI Gym environment.
"""
self._env = env
def __getattr__(self, name):
"""Forward unimplemented attributes to the original environment.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in the wrapped environment.
"""
return getattr(self._env, name)
def step(self, action):
"""Forward action to the wrapped environment.
Args:
action: Action to apply to the environment.
Raises:
ValueError: Invalid action.
Returns:
Converted observation, converted reward, done flag, and info object.
"""
observ, reward, done, info = self._env.step(action)
observ = self._convert_observ(observ)
reward = self._convert_reward(reward)
return observ, reward, done, info
def reset(self):
"""Reset the environment and convert the resulting observation.
Returns:
Converted observation.
"""
observ = self._env.reset()
observ = self._convert_observ(observ)
return observ
def _convert_observ(self, observ):
"""Convert the observation to 32 bits.
Args:
observ: Numpy observation.
Raises:
ValueError: Observation contains infinite values.
Returns:
Numpy observation with 32-bit data type.
"""
if not np.isfinite(observ).all():
raise ValueError('Infinite observation encountered.')
if observ.dtype == np.float64:
return observ.astype(np.float32)
if observ.dtype == np.int64:
return observ.astype(np.int32)
return observ
def _convert_reward(self, reward):
"""Convert the reward to 32 bits.
Args:
reward: Numpy reward.
Raises:
ValueError: Rewards contain infinite values.
Returns:
Numpy reward with 32-bit data type.
"""
if not np.isfinite(reward).all():
raise ValueError('Infinite reward encountered.')
return np.array(reward, dtype=np.float32)
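
Finally, a hedged sketch of composing these wrappers and running the result in a separate process (illustrative; 'Pendulum-v0' is an example id, and the constructor is a module-level function so it stays picklable for multiprocessing):

import gym
import numpy as np

def make_env():
  env = gym.make('Pendulum-v0')
  env = LimitDuration(env, 200)  # end episodes after 200 steps
  env = RangeNormalize(env)      # observations and actions in [-1, 1]
  env = ClipAction(env)          # out-of-range actions are clipped
  env = ConvertTo32Bit(env)
  return env

env = ExternalProcess(make_env)  # environment lives in its own process
observ = env.reset()             # blocking by default
promise = env.step(np.zeros(env.action_space.shape, np.float32),
                   blocking=False)
observ, reward, done, info = promise()  # collect the result later
env.close()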