add a temp copy of TF agents (until the API stops changing or configs.py is included)

Erwin Coumans
2017-11-16 16:47:14 +00:00
parent 7f654bdd87
commit 7b030426c1
24 changed files with 3294 additions and 27 deletions

View File

@@ -0,0 +1,31 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tools for reinforcement learning."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import wrappers
from .attr_dict import AttrDict
from .batch_env import BatchEnv
from .count_weights import count_weights
from .in_graph_batch_env import InGraphBatchEnv
from .in_graph_env import InGraphEnv
from .loop import Loop
from .mock_algorithm import MockAlgorithm
from .mock_environment import MockEnvironment
from .simulate import simulate
from .streaming_mean import StreamingMean

View File

@@ -0,0 +1,54 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrap a dictionary to access keys as attributes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
class AttrDict(dict):
"""Wrap a dictionary to access keys as attributes."""
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
super(AttrDict, self).__setattr__('_mutable', False)
def __getattr__(self, key):
# Do not provide None for unimplemented magic attributes.
if key.startswith('__'):
raise AttributeError
return self.get(key, None)
def __setattr__(self, key, value):
if not self._mutable:
message = "Cannot set attribute '{}'.".format(key)
message += " Use 'with obj.unlocked:' scope to set attributes."
raise RuntimeError(message)
if key.startswith('__'):
raise AttributeError("Cannot set magic attribute '{}'".format(key))
self[key] = value
@property
@contextlib.contextmanager
def unlocked(self):
super(AttrDict, self).__setattr__('_mutable', True)
yield
super(AttrDict, self).__setattr__('_mutable', False)
def copy(self):
return type(self)(super(AttrDict, self).copy())

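A minimal usage sketch of AttrDict (illustrative only; the config name and values are made up):

config = AttrDict(learning_rate=1e-3)
config.learning_rate    # 0.001, keys read as attributes
config.missing          # None, absent keys return None
with config.unlocked:   # attributes may only be set inside this scope
  config.batch_size = 32
config.batch_size = 64  # raises RuntimeError outside the unlocked scope
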
View File

@@ -0,0 +1,124 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Combine multiple environments to step them in batch."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class BatchEnv(object):
"""Combine multiple environments to step them in batch."""
def __init__(self, envs, blocking):
"""Combine multiple environments to step them in batch.
To step environments in parallel, environments must support a
`blocking=False` argument to their step and reset functions that makes them
return callables instead to receive the result at a later time.
Args:
envs: List of environments.
blocking: Step environments one after another rather than in parallel.
Raises:
ValueError: Environments have different observation or action spaces.
"""
self._envs = envs
self._blocking = blocking
observ_space = self._envs[0].observation_space
if not all(env.observation_space == observ_space for env in self._envs):
raise ValueError('All environments must use the same observation space.')
action_space = self._envs[0].action_space
if not all(env.action_space == action_space for env in self._envs):
raise ValueError('All environments must use the same action space.')
def __len__(self):
"""Number of combined environments."""
return len(self._envs)
def __getitem__(self, index):
"""Access an underlying environment by index."""
return self._envs[index]
def __getattr__(self, name):
"""Forward unimplemented attributes to one of the original environments.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in one of the wrapped environments.
"""
return getattr(self._envs[0], name)
def step(self, actions):
"""Forward a batch of actions to the wrapped environments.
Args:
actions: Batched action to apply to the environment.
Raises:
ValueError: Invalid actions.
Returns:
Batch of observations, rewards, and done flags.
"""
for index, (env, action) in enumerate(zip(self._envs, actions)):
if not env.action_space.contains(action):
message = 'Invalid action at index {}: {}'
raise ValueError(message.format(index, action))
if self._blocking:
transitions = [
env.step(action)
for env, action in zip(self._envs, actions)]
else:
transitions = [
env.step(action, blocking=False)
for env, action in zip(self._envs, actions)]
transitions = [transition() for transition in transitions]
observs, rewards, dones, infos = zip(*transitions)
observ = np.stack(observs)
reward = np.stack(rewards)
done = np.stack(dones)
info = tuple(infos)
return observ, reward, done, info
def reset(self, indices=None):
"""Reset the environment and convert the resulting observation.
Args:
indices: The batch indices of environments to reset; defaults to all.
Returns:
Batch of observations.
"""
if indices is None:
indices = np.arange(len(self._envs))
if self._blocking:
observs = [self._envs[index].reset() for index in indices]
else:
observs = [self._envs[index].reset(blocking=False) for index in indices]
observs = [observ() for observ in observs]
observ = np.stack(observs)
return observ
def close(self):
"""Send close messages to the external process and join them."""
for env in self._envs:
if hasattr(env, 'close'):
env.close()

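For reference, a sketch of stepping two ordinary Gym environments through BatchEnv (illustrative; 'CartPole-v0' is just an example id, and blocking=True because plain Gym environments do not support the blocking=False protocol):

import gym
envs = [gym.make('CartPole-v0') for _ in range(2)]
batch = BatchEnv(envs, blocking=True)
observ = batch.reset()  # stacked batch of observations, shape (2, ...)
actions = [env.action_space.sample() for env in batch]
observ, reward, done, info = batch.step(actions)
batch.close()
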
View File

@@ -0,0 +1,48 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Count learnable parameters."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import numpy as np
import tensorflow as tf
def count_weights(scope=None, exclude=None, graph=None):
"""Count learnable parameters.
Args:
scope: Restrict the count to a variable scope.
exclude: Regex to match variable names to exclude.
graph: Operate on a graph other than the current default graph.
Returns:
Number of learnable parameters as integer.
"""
if scope:
scope = scope if scope.endswith('/') else scope + '/'
graph = graph or tf.get_default_graph()
vars_ = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
if scope:
vars_ = [var for var in vars_ if var.name.startswith(scope)]
if exclude:
exclude = re.compile(exclude)
vars_ = [var for var in vars_ if not exclude.match(var.name)]
shapes = [var.get_shape().as_list() for var in vars_]
return int(sum(np.prod(shape) for shape in shapes))

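A quick sketch of how count_weights behaves (TF1-style graph code, matching the rest of this commit; the variable names are made up):

import tensorflow as tf
with tf.variable_scope('policy'):
  tf.get_variable('weights', shape=(10, 4))  # 40 parameters
  tf.get_variable('bias', shape=(4,))        # 4 parameters
count_weights()                   # 44, all trainable variables
count_weights(scope='policy')     # 44, restricted to the scope
count_weights(exclude=r'.*bias')  # 40, bias variables excluded by regex
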
View File

@@ -0,0 +1,178 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Batch of environments inside the TensorFlow graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import tensorflow as tf
class InGraphBatchEnv(object):
"""Batch of environments inside the TensorFlow graph.
The batch of environments will be stepped and reset inside of the graph using
a tf.py_func(). The current batch of observations, actions, rewards, and done
flags are held in corresponding variables.
"""
def __init__(self, batch_env):
"""Batch of environments inside the TensorFlow graph.
Args:
batch_env: Batch environment.
"""
self._batch_env = batch_env
observ_shape = self._parse_shape(self._batch_env.observation_space)
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
action_shape = self._parse_shape(self._batch_env.action_space)
action_dtype = self._parse_dtype(self._batch_env.action_space)
with tf.variable_scope('env_temporary'):
self._observ = tf.Variable(
tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
name='observ', trainable=False)
self._action = tf.Variable(
tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
name='action', trainable=False)
self._reward = tf.Variable(
tf.zeros((len(self._batch_env),), tf.float32),
name='reward', trainable=False)
self._done = tf.Variable(
tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
name='done', trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to one of the original environments.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in one of the original environments.
"""
return getattr(self._batch_env, name)
def __len__(self):
"""Number of combined environments."""
return len(self._batch_env)
def __getitem__(self, index):
"""Access an underlying environment by index."""
return self._batch_env[index]
def simulate(self, action):
"""Step the batch of environments.
The results of the step can be accessed from the variables defined below.
Args:
action: Tensor holding the batch of actions to apply.
Returns:
Operation.
"""
with tf.name_scope('environment/simulate'):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
observ, reward, done = tf.py_func(
lambda a: self._batch_env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool], name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
return tf.group(
self._observ.assign(observ),
self._action.assign(action),
self._reward.assign(reward),
self._done.assign(done))
def reset(self, indices=None):
"""Reset the batch of environments.
Args:
indices: The batch indices of the environments to reset; defaults to all.
Returns:
Batch tensor of the new observations.
"""
if indices is None:
indices = tf.range(len(self._batch_env))
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
observ = tf.py_func(
self._batch_env.reset, [indices], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
reward = tf.zeros_like(indices, tf.float32)
done = tf.zeros_like(indices, tf.bool)
with tf.control_dependencies([
tf.scatter_update(self._observ, indices, observ),
tf.scatter_update(self._reward, indices, reward),
tf.scatter_update(self._done, indices, done)]):
return tf.identity(observ)
@property
def observ(self):
"""Access the variable holding the current observation."""
return self._observ
@property
def action(self):
"""Access the variable holding the last recieved action."""
return self._action
@property
def reward(self):
"""Access the variable holding the current reward."""
return self._reward
@property
def done(self):
"""Access the variable indicating whether the episode is done."""
return self._done
def close(self):
"""Send close messages to the external process and join them."""
self._batch_env.close()
def _parse_shape(self, space):
"""Get a tensor shape from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
Shape tuple.
"""
if isinstance(space, gym.spaces.Discrete):
return ()
if isinstance(space, gym.spaces.Box):
return space.shape
raise NotImplementedError()
def _parse_dtype(self, space):
"""Get a tensor dtype from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
TensorFlow data type.
"""
if isinstance(space, gym.spaces.Discrete):
return tf.int32
if isinstance(space, gym.spaces.Box):
return tf.float32
raise NotImplementedError()

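A hedged sketch of driving InGraphBatchEnv from a TF1 session (illustrative; 'Pendulum-v0' is an example id, and the ConvertTo32Bit wrapper from this commit's wrappers module is used because the py_func declares float32 observations and rewards):

import gym
import tensorflow as tf
envs = [ConvertTo32Bit(gym.make('Pendulum-v0')) for _ in range(2)]
batch_env = InGraphBatchEnv(BatchEnv(envs, blocking=True))
action = tf.placeholder(tf.float32, (2, 1))
step_op = batch_env.simulate(action)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(batch_env.reset())                 # resets all environments
  sess.run(step_op, {action: [[0.0], [0.0]]})
  print(sess.run([batch_env.reward, batch_env.done]))
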
View File

@@ -0,0 +1,162 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Put an OpenAI Gym environment into the TensorFlow graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import tensorflow as tf
class InGraphEnv(object):
"""Put an OpenAI Gym environment into the TensorFlow graph.
The environment will be stepped and reset inside of the graph using
tf.py_func(). The current observation, action, reward, and done flag are held
in corresponding variables.
"""
def __init__(self, env):
"""Put an OpenAI Gym environment into the TensorFlow graph.
Args:
env: OpenAI Gym environment.
"""
self._env = env
observ_shape = self._parse_shape(self._env.observation_space)
observ_dtype = self._parse_dtype(self._env.observation_space)
action_shape = self._parse_shape(self._env.action_space)
action_dtype = self._parse_dtype(self._env.action_space)
with tf.name_scope('environment'):
self._observ = tf.Variable(
tf.zeros(observ_shape, observ_dtype), name='observ', trainable=False)
self._action = tf.Variable(
tf.zeros(action_shape, action_dtype), name='action', trainable=False)
self._reward = tf.Variable(
0.0, dtype=tf.float32, name='reward', trainable=False)
self._done = tf.Variable(
True, dtype=tf.bool, name='done', trainable=False)
self._step = tf.Variable(
0, dtype=tf.int32, name='step', trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to the original environment.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in the wrapped environment.
"""
return getattr(self._env, name)
def simulate(self, action):
"""Step the environment.
The result of the step can be accessed from the variables defined below.
Args:
action: Tensor holding the action to apply.
Returns:
Operation.
"""
with tf.name_scope('environment/simulate'):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._env.observation_space)
observ, reward, done = tf.py_func(
lambda a: self._env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool], name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
return tf.group(
self._observ.assign(observ),
self._action.assign(action),
self._reward.assign(reward),
self._done.assign(done),
self._step.assign_add(1))
def reset(self):
"""Reset the environment.
Returns:
Tensor of the current observation.
"""
observ_dtype = self._parse_dtype(self._env.observation_space)
observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
with tf.control_dependencies([
self._observ.assign(observ),
self._reward.assign(0),
self._done.assign(False)]):
return tf.identity(observ)
@property
def observ(self):
"""Access the variable holding the current observation."""
return self._observ
@property
def action(self):
"""Access the variable holding the last recieved action."""
return self._action
@property
def reward(self):
"""Access the variable holding the current reward."""
return self._reward
@property
def done(self):
"""Access the variable indicating whether the episode is done."""
return self._done
@property
def step(self):
"""Access the variable containg total steps of this environment."""
return self._step
def _parse_shape(self, space):
"""Get a tensor shape from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
Shape tuple.
"""
if isinstance(space, gym.spaces.Discrete):
return ()
if isinstance(space, gym.spaces.Box):
return space.shape
raise NotImplementedError()
def _parse_dtype(self, space):
"""Get a tensor dtype from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
TensorFlow data type.
"""
if isinstance(space, gym.spaces.Discrete):
return tf.int32
if isinstance(space, gym.spaces.Box):
return tf.float32
raise NotImplementedError()

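The single-environment variant works the same way; a brief illustrative sketch (again with ConvertTo32Bit providing float32 data, and 'Pendulum-v0' as an example id):

import gym
import tensorflow as tf
env = InGraphEnv(ConvertTo32Bit(gym.make('Pendulum-v0')))
action = tf.placeholder(tf.float32, (1,))
step_op = env.simulate(action)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(env.reset())
  sess.run(step_op, {action: [0.0]})
  print(sess.run([env.reward, env.done, env.step]))  # step counts transitions
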
View File

@@ -0,0 +1,233 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Execute operations in a loop and coordinate logging and checkpoints."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import tensorflow as tf
from . import streaming_mean
_Phase = collections.namedtuple(
'Phase',
'name, writer, op, batch, steps, feed, report_every, log_every,'
'checkpoint_every')
class Loop(object):
"""Execute operations in a loop and coordinate logging and checkpoints.
Supports multiple phases, that define their own operations to run, and
intervals for reporting scores, logging summaries, and storing checkpoints.
All class state is stored in-graph to properly recover from checkpoints.
"""
def __init__(self, logdir, step=None, log=None, report=None, reset=None):
"""Execute operations in a loop and coordinate logging and checkpoints.
The step, log, report, and reset arguments will be created if not
provided. Reset is used to indicate switching to a new phase, so that the
model can start a new computation in case its computation is split over
multiple training steps.
Args:
logdir: Will contain checkpoints and summaries for each phase.
step: Variable of the global step (optional).
log: Tensor indicating to the model to compute summary tensors.
report: Tensor indicating to the loop to report the current mean score.
reset: Tensor indicating to the model to start a new computation.
"""
self._logdir = logdir
self._step = (
tf.Variable(0, False, name='global_step') if step is None else step)
self._log = tf.placeholder(tf.bool) if log is None else log
self._report = tf.placeholder(tf.bool) if report is None else report
self._reset = tf.placeholder(tf.bool) if reset is None else reset
self._phases = []
def add_phase(
self, name, done, score, summary, steps,
report_every=None, log_every=None, checkpoint_every=None, feed=None):
"""Add a phase to the loop protocol.
If the model breaks long computation into multiple steps, the done tensor
indicates whether the current score should be added to the mean counter.
For example, in reinforcement learning we only have a valid score at the
end of the episode.
Score and done tensors can either be scalars or vectors, to support
single and batched computations.
Args:
name: Name for the phase, used for the summary writer.
done: Tensor indicating whether current score can be used.
score: Tensor holding the current, possibly intermediate, score.
summary: Tensor holding summary string to write if not an empty string.
steps: Duration of the phase in steps.
report_every: Yield mean score every this number of steps.
log_every: Request summaries via `log` tensor every this number of steps.
checkpoint_every: Write checkpoint every this number of steps.
feed: Additional feed dictionary for the session run call.
Raises:
ValueError: Unknown rank for done or score tensors.
"""
done = tf.convert_to_tensor(done, tf.bool)
score = tf.convert_to_tensor(score, tf.float32)
summary = tf.convert_to_tensor(summary, tf.string)
feed = feed or {}
if done.shape.ndims is None or score.shape.ndims is None:
raise ValueError("Rank of 'done' and 'score' tensors must be known.")
writer = self._logdir and tf.summary.FileWriter(
os.path.join(self._logdir, name), tf.get_default_graph(),
flush_secs=60)
op = self._define_step(done, score, summary)
batch = 1 if score.shape.ndims == 0 else score.shape[0].value
self._phases.append(_Phase(
name, writer, op, batch, int(steps), feed, report_every,
log_every, checkpoint_every))
def run(self, sess, saver, max_step=None):
"""Run the loop schedule for a specified number of steps.
Call the operation of the current phase until the global step reaches the
specified maximum step. Phases are repeated over and over in the order they
were added.
Args:
sess: Session to use to run the phase operation.
saver: Saver used for checkpointing.
max_step: Run the operations until the step reaches this limit.
Yields:
Reported mean scores.
"""
global_step = sess.run(self._step)
steps_made = 1
while True:
if max_step and global_step >= max_step:
break
phase, epoch, steps_in = self._find_current_phase(global_step)
phase_step = epoch * phase.steps + steps_in
if steps_in % phase.steps < steps_made:
message = '\n' + ('-' * 50) + '\n'
message += 'Phase {} (phase step {}, global step {}).'
tf.logging.info(message.format(phase.name, phase_step, global_step))
# Populate bookkeeping tensors.
phase.feed[self._reset] = (steps_in < steps_made)
phase.feed[self._log] = (
phase.writer and
self._is_every_steps(phase_step, phase.batch, phase.log_every))
phase.feed[self._report] = (
self._is_every_steps(phase_step, phase.batch, phase.report_every))
summary, mean_score, global_step, steps_made = sess.run(
phase.op, phase.feed)
if self._is_every_steps(phase_step, phase.batch, phase.checkpoint_every):
self._store_checkpoint(sess, saver, global_step)
if self._is_every_steps(phase_step, phase.batch, phase.report_every):
yield mean_score
if summary and phase.writer:
# We want smaller phases to catch up at the beginning of each epoch so
# that their graphs are aligned.
longest_phase = max(phase.steps for phase in self._phases)
summary_step = epoch * longest_phase + steps_in
phase.writer.add_summary(summary, summary_step)
def _is_every_steps(self, phase_step, batch, every):
"""Determine whether a periodic event should happen at this step.
Args:
phase_step: The incrementing step.
batch: The number of steps progressed at once.
every: The interval of the period.
Returns:
Boolean of whether the event should happen.
"""
if not every:
return False
covered_steps = range(phase_step, phase_step + batch)
return any((step + 1) % every == 0 for step in covered_steps)
def _find_current_phase(self, global_step):
"""Determine the current phase based on the global step.
This ensures continuing the correct phase after restoring checkpoints.
Args:
global_step: The global number of steps performed across all phases.
Returns:
Tuple of phase object, epoch number, and phase steps within the epoch.
"""
epoch_size = sum(phase.steps for phase in self._phases)
epoch = int(global_step // epoch_size)
steps_in = global_step % epoch_size
for phase in self._phases:
if steps_in < phase.steps:
return phase, epoch, steps_in
steps_in -= phase.steps
def _define_step(self, done, score, summary):
"""Combine operations of a phase.
Keeps track of the mean score and when to report it.
Args:
done: Tensor indicating whether current score can be used.
score: Tensor holding the current, possibly intermediate, score.
summary: Tensor holding summary string to write if not an empty string.
Returns:
Tuple of summary tensor, mean score, and new global step. The mean score
is zero for non-reporting steps.
"""
if done.shape.ndims == 0:
done = done[None]
if score.shape.ndims == 0:
score = score[None]
score_mean = streaming_mean.StreamingMean((), tf.float32)
with tf.control_dependencies([done, score, summary]):
done_score = tf.gather(score, tf.where(done)[:, 0])
submit_score = tf.cond(
tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
with tf.control_dependencies([submit_score]):
mean_score = tf.cond(self._report, score_mean.clear, float)
steps_made = tf.shape(score)[0]
next_step = self._step.assign_add(steps_made)
with tf.control_dependencies([mean_score, next_step]):
return tf.identity(summary), mean_score, next_step, steps_made
def _store_checkpoint(self, sess, saver, global_step):
"""Store a checkpoint if a log directory was provided to the constructor.
The directory will be created if needed.
Args:
sess: Session containing variables to store.
saver: Saver used for checkpointing.
global_step: Step number of the checkpoint name.
"""
if not self._logdir or not saver:
return
tf.gfile.MakeDirs(self._logdir)
filename = os.path.join(self._logdir, 'model.ckpt')
saver.save(sess, filename, global_step)

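A hedged sketch of the phase protocol (the logdir path, phase name, and dummy score tensor are all made up; a real setup would pass the model's training op and summaries):

import tensorflow as tf
loop = Loop(logdir='/tmp/loop_demo')
score = tf.random_uniform(())  # stand-in for a real score tensor
loop.add_phase(
    'train', done=tf.constant(True), score=score, summary='',
    steps=100, report_every=50)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  # Yields the mean score every 50 steps until the global step hits 200.
  for mean_score in loop.run(sess, saver=None, max_step=200):
    print(mean_score)
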
View File

@@ -0,0 +1,49 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock algorithm for testing reinforcement learning code."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class MockAlgorithm(object):
"""Produce random actions and empty summaries."""
def __init__(self, envs):
"""Produce random actions and empty summaries.
Args:
envs: List of in-graph environments.
"""
self._envs = envs
def begin_episode(self, unused_agent_indices):
return tf.constant('')
def perform(self, agent_indices, unused_observ):
shape = (tf.shape(agent_indices)[0],) + self._envs[0].action_space.shape
low = self._envs[0].action_space.low
high = self._envs[0].action_space.high
action = tf.random_uniform(shape) * (high - low) + low
return action, tf.constant('')
def experience(self, unused_agent_indices, *unused_transition):
return tf.constant('')
def end_episode(self, unused_agent_indices):
return tf.constant('')

View File

@@ -0,0 +1,86 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock environment for testing reinforcement learning code."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import gym.spaces
import numpy as np
class MockEnvironment(object):
"""Generate random agent input and keep track of statistics."""
def __init__(self, observ_shape, action_shape, min_duration, max_duration):
"""Generate random agent input and keep track of statistics.
Args:
observ_shape: Shape for the random observations.
action_shape: Shape for the action space.
min_duration: Minimum number of steps per episode.
max_duration: Maximum number of steps per episode.
Attributes:
steps: List of actual simulated lengths for all episodes.
durations: List of decided lengths for all episodes.
"""
self._observ_shape = observ_shape
self._action_shape = action_shape
self._min_duration = min_duration
self._max_duration = max_duration
self._random = np.random.RandomState(0)
self.steps = []
self.durations = []
@property
def observation_space(self):
low = np.zeros(self._observ_shape)
high = np.ones(self._observ_shape)
return gym.spaces.Box(low, high)
@property
def action_space(self):
low = np.zeros(self._action_shape)
high = np.ones(self._action_shape)
return gym.spaces.Box(low, high)
@property
def unwrapped(self):
return self
def step(self, action):
assert self.action_space.contains(action)
assert self.steps[-1] < self.durations[-1]
self.steps[-1] += 1
observ = self._current_observation()
reward = self._current_reward()
done = self.steps[-1] >= self.durations[-1]
info = {}
return observ, reward, done, info
def reset(self):
duration = self._random.randint(self._min_duration, self._max_duration + 1)
self.steps.append(0)
self.durations.append(duration)
return self._current_observation()
def _current_observation(self):
return self._random.uniform(0, 1, self._observ_shape)
def _current_reward(self):
return self._random.uniform(-1, 1)

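Usage is straightforward; a small sketch (shapes and durations are arbitrary):

env = MockEnvironment(observ_shape=(4,), action_shape=(2,),
                      min_duration=2, max_duration=5)
observ = env.reset()
observ, reward, done, info = env.step(env.action_space.sample())
print(env.steps, env.durations)  # per-episode bookkeeping used by tests
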
View File

@@ -0,0 +1,147 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""In-graph simulation step of a vectorized algorithm with environments."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from . import streaming_mean
def simulate(batch_env, algo, log=True, reset=False):
"""Simulation step of a vecrotized algorithm with in-graph environments.
Integrates the operations implemented by the algorithm and the environments
into a combined operation.
Args:
batch_env: In-graph batch environment.
algo: Algorithm instance implementing required operations.
log: Tensor indicating whether to compute and return summaries.
reset: Tensor causing all environments to reset.
Returns:
Tuple of tensors containing done flags for the current episodes, possibly
intermediate scores for the episodes, and a summary tensor.
"""
def _define_begin_episode(agent_indices):
"""Reset environments, intermediate scores and durations for new episodes.
Args:
agent_indices: Tensor containing batch indices starting an episode.
Returns:
Summary tensor.
"""
assert agent_indices.shape.ndims == 1
zero_scores = tf.zeros_like(agent_indices, tf.float32)
zero_durations = tf.zeros_like(agent_indices)
reset_ops = [
batch_env.reset(agent_indices),
tf.scatter_update(score, agent_indices, zero_scores),
tf.scatter_update(length, agent_indices, zero_durations)]
with tf.control_dependencies(reset_ops):
return algo.begin_episode(agent_indices)
def _define_step():
"""Request actions from the algorithm and apply them to the environments.
Increments the lengths of all episodes and increases their scores by the
current reward. After stepping the environments, provides the full
transition tuple to the algorithm.
Returns:
Summary tensor.
"""
prevob = batch_env.observ + 0 # Ensure a copy of the variable value.
agent_indices = tf.range(len(batch_env))
action, step_summary = algo.perform(agent_indices, prevob)
action.set_shape(batch_env.action.shape)
with tf.control_dependencies([batch_env.simulate(action)]):
add_score = score.assign_add(batch_env.reward)
inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
with tf.control_dependencies([add_score, inc_length]):
agent_indices = tf.range(len(batch_env))
experience_summary = algo.experience(
agent_indices, prevob, batch_env.action, batch_env.reward,
batch_env.done, batch_env.observ)
return tf.summary.merge([step_summary, experience_summary])
def _define_end_episode(agent_indices):
"""Notify the algorithm of ending episodes.
Also updates the mean score and length counters used for summaries.
Args:
agent_indices: Tensor holding batch indices that end their episodes.
Returns:
Summary tensor.
"""
assert agent_indices.shape.ndims == 1
submit_score = mean_score.submit(tf.gather(score, agent_indices))
submit_length = mean_length.submit(
tf.cast(tf.gather(length, agent_indices), tf.float32))
with tf.control_dependencies([submit_score, submit_length]):
return algo.end_episode(agent_indices)
def _define_summaries():
"""Reset the average score and duration, and return them as summary.
Returns:
Summary string.
"""
score_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
length_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
return tf.summary.merge([score_summary, length_summary])
with tf.name_scope('simulate'):
log = tf.convert_to_tensor(log)
reset = tf.convert_to_tensor(reset)
with tf.variable_scope('simulate_temporary'):
score = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
length = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
mean_score = streaming_mean.StreamingMean((), tf.float32)
mean_length = streaming_mean.StreamingMean((), tf.float32)
agent_indices = tf.cond(
reset,
lambda: tf.range(len(batch_env)),
lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
begin_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_begin_episode(agent_indices), str)
with tf.control_dependencies([begin_episode]):
step = _define_step()
with tf.control_dependencies([step]):
agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
end_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_end_episode(agent_indices), str)
with tf.control_dependencies([end_episode]):
summary = tf.summary.merge([
_define_summaries(), begin_episode, step, end_episode])
with tf.control_dependencies([summary]):
done, score = tf.identity(batch_env.done), tf.identity(score)
return done, score, summary

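A hedged end-to-end sketch wiring the pieces of this commit together (MockEnvironment and MockAlgorithm from above, ConvertTo32Bit from the wrappers module because the in-graph environment expects float32 data; running done and score also runs the summary via the control dependencies):

import tensorflow as tf
envs = [ConvertTo32Bit(MockEnvironment((2,), (3,), 2, 4))
        for _ in range(2)]
batch_env = InGraphBatchEnv(BatchEnv(envs, blocking=True))
algo = MockAlgorithm(batch_env)
done, score, summary = simulate(batch_env, algo, log=False, reset=False)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run([done, score]))  # one simulation step for both episodes
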
View File

@@ -0,0 +1,67 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute a streaming estimation of the mean of submitted tensors."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class StreamingMean(object):
"""Compute a streaming estimation of the mean of submitted tensors."""
def __init__(self, shape, dtype):
"""Specify the shape and dtype of the mean to be estimated.
Note that the float mean of zero submitted elements is NaN, while computing
the integer mean of zero elements raises a division by zero error.
Args:
shape: Shape of the mean to compute.
dtype: Data type of the mean to compute.
"""
self._dtype = dtype
self._sum = tf.Variable(lambda: tf.zeros(shape, dtype), False)
self._count = tf.Variable(lambda: 0, trainable=False)
@property
def value(self):
"""The current value of the mean."""
return self._sum / tf.cast(self._count, self._dtype)
@property
def count(self):
"""The number of submitted samples."""
return self._count
def submit(self, value):
"""Submit a single or batch tensor to refine the streaming mean."""
# Add a batch dimension if necessary.
if value.shape.ndims == self._sum.shape.ndims:
value = value[None, ...]
return tf.group(
self._sum.assign_add(tf.reduce_sum(value, 0)),
self._count.assign_add(tf.shape(value)[0]))
def clear(self):
"""Return the mean estimate and reset the streaming statistics."""
value = self._sum / tf.cast(self._count, self._dtype)
with tf.control_dependencies([value]):
reset_value = self._sum.assign(tf.zeros_like(self._sum))
reset_count = self._count.assign(0)
with tf.control_dependencies([reset_value, reset_count]):
return tf.identity(value)

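A tiny sketch of the submit/clear cycle (the values are arbitrary):

import tensorflow as tf
mean = StreamingMean((), tf.float32)
submit = mean.submit(tf.constant([1.0, 2.0, 3.0]))
value = mean.value
clear = mean.clear()
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(submit)
  print(sess.run(value))  # 2.0
  print(sess.run(clear))  # 2.0, and the statistics reset afterwards
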
View File

@@ -0,0 +1,558 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrappers for OpenAI Gym environments."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import multiprocessing
import sys
import traceback
import gym
import gym.spaces
import numpy as np
import tensorflow as tf
class AutoReset(object):
"""Automatically reset environment when the episode is done."""
def __init__(self, env):
self._env = env
self._done = True
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
if self._done:
observ, reward, done, info = self._env.reset(), 0.0, False, {}
else:
observ, reward, done, info = self._env.step(action)
self._done = done
return observ, reward, done, info
def reset(self):
self._done = False
return self._env.reset()
class ActionRepeat(object):
"""Repeat the agent action multiple steps."""
def __init__(self, env, amount):
self._env = env
self._amount = amount
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
done = False
total_reward = 0
current_step = 0
while current_step < self._amount and not done:
observ, reward, done, info = self._env.step(action)
total_reward += reward
current_step += 1
return observ, total_reward, done, info
class RandomStart(object):
"""Perform random number of random actions at the start of the episode."""
def __init__(self, env, max_steps):
self._env = env
self._max_steps = max_steps
def __getattr__(self, name):
return getattr(self._env, name)
def reset(self):
observ = self._env.reset()
random_steps = np.random.randint(0, self._max_steps)
for _ in range(random_steps):
action = self._env.action_space.sample()
observ, unused_reward, done, unused_info = self._env.step(action)
if done:
tf.logging.warning('Episode ended during random start.')
return self.reset()
return observ
class FrameHistory(object):
"""Augment the observation with past observations."""
def __init__(self, env, past_indices, flatten):
"""Augment the observation with past observations.
Implemented as a Numpy ring buffer holding the necessary past observations.
Args:
env: OpenAI Gym environment to wrap.
past_indices: List of non-negative integers indicating the time offsets
from the current time step of observations to include.
flatten: Concatenate the past observations rather than stacking them.
Raises:
KeyError: The current observation is not included in the indices.
"""
if 0 not in past_indices:
raise KeyError('Past indices should include 0 for the current frame.')
self._env = env
self._past_indices = past_indices
self._step = 0
self._buffer = None
self._capacity = max(past_indices)
self._flatten = flatten
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
low = self._env.observation_space.low
high = self._env.observation_space.high
low = np.repeat(low[None, ...], len(self._past_indices), 0)
high = np.repeat(high[None, ...], len(self._past_indices), 0)
if self._flatten:
low = np.reshape(low, (-1,) + low.shape[2:])
high = np.reshape(high, (-1,) + high.shape[2:])
return gym.spaces.Box(low, high)
def step(self, action):
observ, reward, done, info = self._env.step(action)
self._step += 1
self._buffer[self._step % self._capacity] = observ
observ = self._select_frames()
return observ, reward, done, info
def reset(self):
observ = self._env.reset()
self._buffer = np.repeat(observ[None, ...], self._capacity, 0)
self._step = 0
return self._select_frames()
def _select_frames(self):
indices = [
(self._step - index) % self._capacity for index in self._past_indices]
observ = self._buffer[indices]
if self._flatten:
observ = np.reshape(observ, (-1,) + observ.shape[2:])
return observ
class FrameDelta(object):
"""Convert the observation to a difference from the previous observation."""
def __init__(self, env):
self._env = env
self._last = None
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
low = self._env.observation_space.low
high = self._env.observation_space.high
low, high = low - high, high - low
return gym.spaces.Box(low, high)
def step(self, action):
observ, reward, done, info = self._env.step(action)
delta = observ - self._last
self._last = observ
return delta, reward, done, info
def reset(self):
observ = self._env.reset()
self._last = observ
return observ
class RangeNormalize(object):
"""Normalize the specialized observation and action ranges to [-1, 1]."""
def __init__(self, env, observ=None, action=None):
self._env = env
self._should_normalize_observ = (
observ is not False and self._is_finite(self._env.observation_space))
if observ is True and not self._should_normalize_observ:
raise ValueError('Cannot normalize infinite observation range.')
if observ is None and not self._should_normalize_observ:
tf.logging.info('Not normalizing infinite observation range.')
self._should_normalize_action = (
action is not False and self._is_finite(self._env.action_space))
if action is True and not self._should_normalize_action:
raise ValueError('Cannot normalize infinite action range.')
if action is None and not self._should_normalize_action:
tf.logging.info('Not normalizing infinite action range.')
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
space = self._env.observation_space
if not self._should_normalize_observ:
return space
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
@property
def action_space(self):
space = self._env.action_space
if not self._should_normalize_action:
return space
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
def step(self, action):
if self._should_normalize_action:
action = self._denormalize_action(action)
observ, reward, done, info = self._env.step(action)
if self._should_normalize_observ:
observ = self._normalize_observ(observ)
return observ, reward, done, info
def reset(self):
observ = self._env.reset()
if self._should_normalize_observ:
observ = self._normalize_observ(observ)
return observ
def _denormalize_action(self, action):
min_ = self._env.action_space.low
max_ = self._env.action_space.high
action = (action + 1) / 2 * (max_ - min_) + min_
return action
def _normalize_observ(self, observ):
min_ = self._env.observation_space.low
max_ = self._env.observation_space.high
observ = 2 * (observ - min_) / (max_ - min_) - 1
return observ
def _is_finite(self, space):
return np.isfinite(space.low).all() and np.isfinite(space.high).all()
class ClipAction(object):
"""Clip out of range actions to the action space of the environment."""
def __init__(self, env):
self._env = env
def __getattr__(self, name):
return getattr(self._env, name)
@property
def action_space(self):
shape = self._env.action_space.shape
return gym.spaces.Box(-np.inf * np.ones(shape), np.inf * np.ones(shape))
def step(self, action):
action_space = self._env.action_space
action = np.clip(action, action_space.low, action_space.high)
return self._env.step(action)
class LimitDuration(object):
"""End episodes after specified number of steps."""
def __init__(self, env, duration):
self._env = env
self._duration = duration
self._step = None
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
if self._step is None:
raise RuntimeError('Must reset environment.')
observ, reward, done, info = self._env.step(action)
self._step += 1
if self._step >= self._duration:
done = True
self._step = None
return observ, reward, done, info
def reset(self):
self._step = 0
return self._env.reset()
class ExternalProcess(object):
"""Step environment in a separate process for lock free paralellism."""
# Message types for communication via the pipe.
_ACCESS = 1
_CALL = 2
_RESULT = 3
_EXCEPTION = 4
_CLOSE = 5
def __init__(self, constructor):
"""Step environment in a separate process for lock free paralellism.
The environment will be created in the external process by calling the
specified callable. This can be an environment class, or a function
creating the environment and potentially wrapping it. The returned
environment should not access global variables.
Args:
constructor: Callable that creates and returns an OpenAI gym environment.
Attributes:
observation_space: The cached observation space of the environment.
action_space: The cached action space of the environment.
"""
self._conn, conn = multiprocessing.Pipe()
self._process = multiprocessing.Process(
target=self._worker, args=(constructor, conn))
atexit.register(self.close)
self._process.start()
self._observ_space = None
self._action_space = None
@property
def observation_space(self):
if not self._observ_space:
self._observ_space = self.__getattr__('observation_space')
return self._observ_space
@property
def action_space(self):
if not self._action_space:
self._action_space = self.__getattr__('action_space')
return self._action_space
def __getattr__(self, name):
"""Request an attribute from the environment.
Note that this involves communication with the external process, so it can
be slow.
Args:
name: Attribute to access.
Returns:
Value of the attribute.
"""
self._conn.send((self._ACCESS, name))
return self._receive()
def call(self, name, *args, **kwargs):
"""Asynchronously call a method of the external environment.
Args:
name: Name of the method to call.
*args: Positional arguments to forward to the method.
**kwargs: Keyword arguments to forward to the method.
Returns:
Promise object that blocks and provides the return value when called.
"""
payload = name, args, kwargs
self._conn.send((self._CALL, payload))
return self._receive
def close(self):
"""Send a close message to the external process and join it."""
try:
self._conn.send((self._CLOSE, None))
self._conn.close()
except IOError:
# The connection was already closed.
pass
self._process.join()
def step(self, action, blocking=True):
"""Step the environment.
Args:
action: The action to apply to the environment.
blocking: Whether to wait for the result.
Returns:
Transition tuple when blocking, otherwise callable that returns the
transition tuple.
"""
promise = self.call('step', action)
if blocking:
return promise()
else:
return promise
def reset(self, blocking=True):
"""Reset the environment.
Args:
blocking: Whether to wait for the result.
Returns:
New observation when blocking, otherwise callable that returns the new
observation.
"""
promise = self.call('reset')
if blocking:
return promise()
else:
return promise
def _receive(self):
"""Wait for a message from the worker process and return its payload.
Raises:
Exception: An exception was raised inside the worker process.
KeyError: The received message is of an unknown type.
Returns:
Payload object of the message.
"""
message, payload = self._conn.recv()
# Re-raise exceptions in the main process.
if message == self._EXCEPTION:
stacktrace = payload
raise Exception(stacktrace)
if message == self._RESULT:
return payload
raise KeyError('Received message of unexpected type {}'.format(message))
def _worker(self, constructor, conn):
"""The process waits for actions and sends back environment results.
Args:
constructor: Constructor for the OpenAI Gym environment.
conn: Connection for communication to the main process.
"""
try:
env = constructor()
while True:
try:
# Only block for short times to have keyboard exceptions be raised.
if not conn.poll(0.1):
continue
message, payload = conn.recv()
except (EOFError, KeyboardInterrupt):
break
if message == self._ACCESS:
name = payload
result = getattr(env, name)
conn.send((self._RESULT, result))
continue
if message == self._CALL:
name, args, kwargs = payload
result = getattr(env, name)(*args, **kwargs)
conn.send((self._RESULT, result))
continue
if message == self._CLOSE:
assert payload is None
break
raise KeyError('Received message of unknown type {}'.format(message))
except Exception: # pylint: disable=broad-except
stacktrace = ''.join(traceback.format_exception(*sys.exc_info()))
tf.logging.error('Error in environment process: {}'.format(stacktrace))
conn.send((self._EXCEPTION, stacktrace))
conn.close()
class ConvertTo32Bit(object):
"""Convert data types of an OpenAI Gym environment to 32 bit."""
def __init__(self, env):
"""Convert data types of an OpenAI Gym environment to 32 bit.
Args:
env: OpenAI Gym environment.
"""
self._env = env
def __getattr__(self, name):
"""Forward unimplemented attributes to the original environment.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in the wrapped environment.
"""
return getattr(self._env, name)
def step(self, action):
"""Forward action to the wrapped environment.
Args:
action: Action to apply to the environment.
Raises:
ValueError: Invalid action.
Returns:
Converted observation, converted reward, done flag, and info object.
"""
observ, reward, done, info = self._env.step(action)
observ = self._convert_observ(observ)
reward = self._convert_reward(reward)
return observ, reward, done, info
def reset(self):
"""Reset the environment and convert the resulting observation.
Returns:
Converted observation.
"""
observ = self._env.reset()
observ = self._convert_observ(observ)
return observ
def _convert_observ(self, observ):
"""Convert the observation to 32 bits.
Args:
observ: Numpy observation.
Raises:
ValueError: Observation contains infinite values.
Returns:
Numpy observation with 32-bit data type.
"""
if not np.isfinite(observ).all():
raise ValueError('Infinite observation encountered.')
if observ.dtype == np.float64:
return observ.astype(np.float32)
if observ.dtype == np.int64:
return observ.astype(np.int32)
return observ
def _convert_reward(self, reward):
"""Convert the reward to 32 bits.
Args:
reward: Numpy reward.
Raises:
ValueError: Rewards contain infinite values.
Returns:
Numpy reward with 32-bit data type.
"""
if not np.isfinite(reward).all():
raise ValueError('Infinite reward encountered.')
return np.array(reward, dtype=np.float32)
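
Finally, a hedged sketch of composing these wrappers and running the result in a separate process (illustrative; 'Pendulum-v0' is an example id, and the constructor is a module-level function so it stays picklable for multiprocessing):

import gym
import numpy as np

def make_env():
  env = gym.make('Pendulum-v0')
  env = LimitDuration(env, 200)  # end episodes after 200 steps
  env = RangeNormalize(env)      # observations and actions in [-1, 1]
  env = ClipAction(env)          # out-of-range actions are clipped
  env = ConvertTo32Bit(env)
  return env

env = ExternalProcess(make_env)  # environment lives in its own process
observ = env.reset()             # blocking by default
promise = env.step(np.zeros(env.action_space.shape, np.float32),
                   blocking=False)
observ, reward, done, info = promise()  # collect the result later
env.close()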