add a temp copy of TF Agents (until the API stops changing or configs.py is included)
31
examples/pybullet/gym/pybullet_envs/agents/tools/__init__.py
Normal file
@@ -0,0 +1,31 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tools for reinforcement learning."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from . import wrappers
|
||||
from .attr_dict import AttrDict
|
||||
from .batch_env import BatchEnv
|
||||
from .count_weights import count_weights
|
||||
from .in_graph_batch_env import InGraphBatchEnv
|
||||
from .in_graph_env import InGraphEnv
|
||||
from .loop import Loop
|
||||
from .mock_algorithm import MockAlgorithm
|
||||
from .mock_environment import MockEnvironment
|
||||
from .simulate import simulate
|
||||
from .streaming_mean import StreamingMean
|
||||
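Illustrative usage sketch of the package entry point (the import path and the Pendulum-v0 environment are assumptions, not part of this file; the parent packages must be importable and gym installed):

import gym
from pybullet_envs.agents import tools  # assumed package path from this commit's layout

# Wrap a Gym environment with one of the tools and step it through BatchEnv.
env = tools.wrappers.LimitDuration(gym.make('Pendulum-v0'), duration=200)
batch = tools.BatchEnv([env], blocking=True)
observ = batch.reset()  # batch of one stacked observation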
54
examples/pybullet/gym/pybullet_envs/agents/tools/attr_dict.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Wrap a dictionary to access keys as attributes."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import contextlib
|
||||
|
||||
|
||||
class AttrDict(dict):
|
||||
"""Wrap a dictionary to access keys as attributes."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(AttrDict, self).__init__(*args, **kwargs)
|
||||
super(AttrDict, self).__setattr__('_mutable', False)
|
||||
|
||||
def __getattr__(self, key):
|
||||
# Do not provide None for unimplemented magic attributes.
|
||||
if key.startswith('__'):
|
||||
raise AttributeError
|
||||
return self.get(key, None)
|
||||
|
||||
def __setattr__(self, key, value):
|
||||
if not self._mutable:
|
||||
message = "Cannot set attribute '{}'.".format(key)
|
||||
message += " Use 'with obj.unlocked:' scope to set attributes."
|
||||
raise RuntimeError(message)
|
||||
if key.startswith('__'):
|
||||
raise AttributeError("Cannot set magic attribute '{}'".format(key))
|
||||
self[key] = value
|
||||
|
||||
@property
|
||||
@contextlib.contextmanager
|
||||
def unlocked(self):
|
||||
super(AttrDict, self).__setattr__('_mutable', True)
|
||||
yield
|
||||
super(AttrDict, self).__setattr__('_mutable', False)
|
||||
|
||||
def copy(self):
|
||||
return type(self)(super(AttrDict, self).copy())
|
||||
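A minimal usage sketch of the AttrDict above (illustrative; the key names are arbitrary): keys are read as attributes, missing keys return None, and writes are only allowed inside the unlocked scope.

config = AttrDict(learning_rate=1e-3)
assert config.learning_rate == 1e-3  # attribute-style read of a dict key
assert config.batch_size is None     # missing keys return None instead of raising
with config.unlocked:
    config.batch_size = 32           # writes are only allowed inside this scope
# Assigning config.epochs = 10 outside the scope would raise RuntimeError.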
124
examples/pybullet/gym/pybullet_envs/agents/tools/batch_env.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Combine multiple environments to step them in batch."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class BatchEnv(object):
|
||||
"""Combine multiple environments to step them in batch."""
|
||||
|
||||
def __init__(self, envs, blocking):
|
||||
"""Combine multiple environments to step them in batch.
|
||||
|
||||
To step environments in parallel, environments must support a
|
||||
`blocking=False` argument to their step and reset functions that makes them
|
||||
return callables instead to receive the result at a later time.
|
||||
|
||||
Args:
|
||||
envs: List of environments.
|
||||
blocking: Step environments one after another rather than in parallel.
|
||||
|
||||
Raises:
|
||||
ValueError: Environments have different observation or action spaces.
|
||||
"""
|
||||
self._envs = envs
|
||||
self._blocking = blocking
|
||||
observ_space = self._envs[0].observation_space
|
||||
if not all(env.observation_space == observ_space for env in self._envs):
|
||||
raise ValueError('All environments must use the same observation space.')
|
||||
action_space = self._envs[0].action_space
|
||||
if not all(env.action_space == action_space for env in self._envs):
|
||||
raise ValueError('All environments must use the same action space.')
|
||||
|
||||
def __len__(self):
|
||||
"""Number of combined environments."""
|
||||
return len(self._envs)
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""Access an underlying environment by index."""
|
||||
return self._envs[index]
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Forward unimplemented attributes to one of the original environments.
|
||||
|
||||
Args:
|
||||
name: Attribute that was accessed.
|
||||
|
||||
Returns:
|
||||
Value behind the attribute name in one of the wrapped environments.
|
||||
"""
|
||||
return getattr(self._envs[0], name)
|
||||
|
||||
def step(self, actions):
|
||||
"""Forward a batch of actions to the wrapped environments.
|
||||
|
||||
Args:
|
||||
actions: Batched action to apply to the environment.
|
||||
|
||||
Raises:
|
||||
ValueError: Invalid actions.
|
||||
|
||||
Returns:
|
||||
Batch of observations, rewards, and done flags.
|
||||
"""
|
||||
for index, (env, action) in enumerate(zip(self._envs, actions)):
|
||||
if not env.action_space.contains(action):
|
||||
message = 'Invalid action at index {}: {}'
|
||||
raise ValueError(message.format(index, action))
|
||||
if self._blocking:
|
||||
transitions = [
|
||||
env.step(action)
|
||||
for env, action in zip(self._envs, actions)]
|
||||
else:
|
||||
transitions = [
|
||||
env.step(action, blocking=False)
|
||||
for env, action in zip(self._envs, actions)]
|
||||
transitions = [transition() for transition in transitions]
|
||||
observs, rewards, dones, infos = zip(*transitions)
|
||||
observ = np.stack(observs)
|
||||
reward = np.stack(rewards)
|
||||
done = np.stack(dones)
|
||||
info = tuple(infos)
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self, indices=None):
|
||||
"""Reset the environment and convert the resulting observation.
|
||||
|
||||
Args:
|
||||
indices: The batch indices of environments to reset; defaults to all.
|
||||
|
||||
Returns:
|
||||
Batch of observations.
|
||||
"""
|
||||
if indices is None:
|
||||
indices = np.arange(len(self._envs))
|
||||
if self._blocking:
|
||||
observs = [self._envs[index].reset() for index in indices]
|
||||
else:
|
||||
observs = [self._envs[index].reset(blocking=False) for index in indices]
|
||||
observs = [observ() for observ in observs]
|
||||
observ = np.stack(observs)
|
||||
return observ
|
||||
|
||||
def close(self):
|
||||
"""Send close messages to the external process and join them."""
|
||||
for env in self._envs:
|
||||
if hasattr(env, 'close'):
|
||||
env.close()
|
||||
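An in-process usage sketch of BatchEnv (illustrative; it reuses the MockEnvironment added later in this commit so that all observation and action spaces match):

import numpy as np

envs = [MockEnvironment((4,), (2,), min_duration=5, max_duration=10) for _ in range(3)]
batch = BatchEnv(envs, blocking=True)
observ = batch.reset()                                    # stacked batch, shape (3, 4)
actions = np.stack([env.action_space.sample() for env in envs])
observ, reward, done, info = batch.step(actions)          # batched transition
batch.close()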
48
examples/pybullet/gym/pybullet_envs/agents/tools/count_weights.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Count learnable parameters."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def count_weights(scope=None, exclude=None, graph=None):
|
||||
"""Count learnable parameters.
|
||||
|
||||
Args:
|
||||
scope: Restrict the count to a variable scope.
|
||||
exclude: Regex to match variable names to exclude.
|
||||
graph: Operate on a graph other than the current default graph.
|
||||
|
||||
Returns:
|
||||
Number of learnable parameters as integer.
|
||||
"""
|
||||
if scope:
|
||||
scope = scope if scope.endswith('/') else scope + '/'
|
||||
graph = graph or tf.get_default_graph()
|
||||
vars_ = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
|
||||
if scope:
|
||||
vars_ = [var for var in vars_ if var.name.startswith(scope)]
|
||||
if exclude:
|
||||
exclude = re.compile(exclude)
|
||||
vars_ = [var for var in vars_ if not exclude.match(var.name)]
|
||||
shapes = [var.get_shape().as_list() for var in vars_]
|
||||
return int(sum(np.prod(shape) for shape in shapes))
|
||||
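A small sketch of count_weights assuming TensorFlow 1.x (the 'policy' scope name and the dense layer are arbitrary choices for illustration):

import tensorflow as tf

with tf.Graph().as_default():
    inputs = tf.placeholder(tf.float32, [None, 8])
    with tf.variable_scope('policy'):
        tf.layers.dense(inputs, 4)                        # 8 * 4 kernel weights + 4 biases
    print(count_weights('policy'))                        # 36
    print(count_weights('policy', exclude=r'.*/bias.*'))  # 32, biases excluded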
178
examples/pybullet/gym/pybullet_envs/agents/tools/in_graph_batch_env.py
Normal file
@@ -0,0 +1,178 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Batch of environments inside the TensorFlow graph."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class InGraphBatchEnv(object):
|
||||
"""Batch of environments inside the TensorFlow graph.
|
||||
|
||||
The batch of environments will be stepped and reset inside of the graph using
|
||||
a tf.py_func(). The current batch of observations, actions, rewards, and done
|
||||
flags are held in corresponding variables.
|
||||
"""
|
||||
|
||||
def __init__(self, batch_env):
|
||||
"""Batch of environments inside the TensorFlow graph.
|
||||
|
||||
Args:
|
||||
batch_env: Batch environment.
|
||||
"""
|
||||
self._batch_env = batch_env
|
||||
observ_shape = self._parse_shape(self._batch_env.observation_space)
|
||||
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
|
||||
action_shape = self._parse_shape(self._batch_env.action_space)
|
||||
action_dtype = self._parse_dtype(self._batch_env.action_space)
|
||||
with tf.variable_scope('env_temporary'):
|
||||
self._observ = tf.Variable(
|
||||
tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
|
||||
name='observ', trainable=False)
|
||||
self._action = tf.Variable(
|
||||
tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
|
||||
name='action', trainable=False)
|
||||
self._reward = tf.Variable(
|
||||
tf.zeros((len(self._batch_env),), tf.float32),
|
||||
name='reward', trainable=False)
|
||||
self._done = tf.Variable(
|
||||
tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
|
||||
name='done', trainable=False)
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Forward unimplemented attributes to one of the original environments.
|
||||
|
||||
Args:
|
||||
name: Attribute that was accessed.
|
||||
|
||||
Returns:
|
||||
Value behind the attribute name in one of the original environments.
|
||||
"""
|
||||
return getattr(self._batch_env, name)
|
||||
|
||||
def __len__(self):
|
||||
"""Number of combined environments."""
|
||||
return len(self._batch_env)
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""Access an underlying environment by index."""
|
||||
return self._batch_env[index]
|
||||
|
||||
def simulate(self, action):
|
||||
"""Step the batch of environments.
|
||||
|
||||
The results of the step can be accessed from the variables defined below.
|
||||
|
||||
Args:
|
||||
action: Tensor holding the batch of actions to apply.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
with tf.name_scope('environment/simulate'):
|
||||
if action.dtype in (tf.float16, tf.float32, tf.float64):
|
||||
action = tf.check_numerics(action, 'action')
|
||||
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
|
||||
observ, reward, done = tf.py_func(
|
||||
lambda a: self._batch_env.step(a)[:3], [action],
|
||||
[observ_dtype, tf.float32, tf.bool], name='step')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
reward = tf.check_numerics(reward, 'reward')
|
||||
return tf.group(
|
||||
self._observ.assign(observ),
|
||||
self._action.assign(action),
|
||||
self._reward.assign(reward),
|
||||
self._done.assign(done))
|
||||
|
||||
def reset(self, indices=None):
|
||||
"""Reset the batch of environments.
|
||||
|
||||
Args:
|
||||
indices: The batch indices of the environments to reset; defaults to all.
|
||||
|
||||
Returns:
|
||||
Batch tensor of the new observations.
|
||||
"""
|
||||
if indices is None:
|
||||
indices = tf.range(len(self._batch_env))
|
||||
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
|
||||
observ = tf.py_func(
|
||||
self._batch_env.reset, [indices], observ_dtype, name='reset')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
reward = tf.zeros_like(indices, tf.float32)
|
||||
done = tf.zeros_like(indices, tf.bool)
|
||||
with tf.control_dependencies([
|
||||
tf.scatter_update(self._observ, indices, observ),
|
||||
tf.scatter_update(self._reward, indices, reward),
|
||||
tf.scatter_update(self._done, indices, done)]):
|
||||
return tf.identity(observ)
|
||||
|
||||
@property
|
||||
def observ(self):
|
||||
"""Access the variable holding the current observation."""
|
||||
return self._observ
|
||||
|
||||
@property
|
||||
def action(self):
|
||||
"""Access the variable holding the last recieved action."""
|
||||
return self._action
|
||||
|
||||
@property
|
||||
def reward(self):
|
||||
"""Access the variable holding the current reward."""
|
||||
return self._reward
|
||||
|
||||
@property
|
||||
def done(self):
|
||||
"""Access the variable indicating whether the episode is done."""
|
||||
return self._done
|
||||
|
||||
def close(self):
|
||||
"""Send close messages to the external process and join them."""
|
||||
self._batch_env.close()
|
||||
|
||||
def _parse_shape(self, space):
|
||||
"""Get a tensor shape from a OpenAI Gym space.
|
||||
|
||||
Args:
|
||||
space: Gym space.
|
||||
|
||||
Returns:
|
||||
Shape tuple.
|
||||
"""
|
||||
if isinstance(space, gym.spaces.Discrete):
|
||||
return ()
|
||||
if isinstance(space, gym.spaces.Box):
|
||||
return space.shape
|
||||
raise NotImplementedError()
|
||||
|
||||
def _parse_dtype(self, space):
|
||||
"""Get a tensor dtype from a OpenAI Gym space.
|
||||
|
||||
Args:
|
||||
space: Gym space.
|
||||
|
||||
Returns:
|
||||
TensorFlow data type.
|
||||
"""
|
||||
if isinstance(space, gym.spaces.Discrete):
|
||||
return tf.int32
|
||||
if isinstance(space, gym.spaces.Box):
|
||||
return tf.float32
|
||||
raise NotImplementedError()
|
||||
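A graph-construction sketch assuming TensorFlow 1.x (illustrative; the Pendulum-v0 environment is an arbitrary choice, and at run time the wrapped environments must return observations whose NumPy dtype matches the parsed TensorFlow dtype, float32 for Box spaces, for tf.py_func to succeed):

import gym
import tensorflow as tf

batch_env = BatchEnv([gym.make('Pendulum-v0') for _ in range(4)], blocking=True)
in_graph = InGraphBatchEnv(batch_env)
actions = tf.zeros((len(in_graph),) + batch_env.action_space.shape, tf.float32)
step_op = in_graph.simulate(actions)  # tf.group updating observ/action/reward/done
reset_op = in_graph.reset()           # scatter-updates the batch variables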
162
examples/pybullet/gym/pybullet_envs/agents/tools/in_graph_env.py
Normal file
@@ -0,0 +1,162 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Put an OpenAI Gym environment into the TensorFlow graph."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class InGraphEnv(object):
|
||||
"""Put an OpenAI Gym environment into the TensorFlow graph.
|
||||
|
||||
The environment will be stepped and reset inside of the graph using
|
||||
tf.py_func(). The current observation, action, reward, and done flag are held
|
||||
in corresponding variables.
|
||||
"""
|
||||
|
||||
def __init__(self, env):
|
||||
"""Put an OpenAI Gym environment into the TensorFlow graph.
|
||||
|
||||
Args:
|
||||
env: OpenAI Gym environment.
|
||||
"""
|
||||
self._env = env
|
||||
observ_shape = self._parse_shape(self._env.observation_space)
|
||||
observ_dtype = self._parse_dtype(self._env.observation_space)
|
||||
action_shape = self._parse_shape(self._env.action_space)
|
||||
action_dtype = self._parse_dtype(self._env.action_space)
|
||||
with tf.name_scope('environment'):
|
||||
self._observ = tf.Variable(
|
||||
tf.zeros(observ_shape, observ_dtype), name='observ', trainable=False)
|
||||
self._action = tf.Variable(
|
||||
tf.zeros(action_shape, action_dtype), name='action', trainable=False)
|
||||
self._reward = tf.Variable(
|
||||
0.0, dtype=tf.float32, name='reward', trainable=False)
|
||||
self._done = tf.Variable(
|
||||
True, dtype=tf.bool, name='done', trainable=False)
|
||||
self._step = tf.Variable(
|
||||
0, dtype=tf.int32, name='step', trainable=False)
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Forward unimplemented attributes to the original environment.
|
||||
|
||||
Args:
|
||||
name: Attribute that was accessed.
|
||||
|
||||
Returns:
|
||||
Value behind the attribute name in the wrapped environment.
|
||||
"""
|
||||
return getattr(self._env, name)
|
||||
|
||||
def simulate(self, action):
|
||||
"""Step the environment.
|
||||
|
||||
The result of the step can be accessed from the variables defined below.
|
||||
|
||||
Args:
|
||||
action: Tensor holding the action to apply.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
with tf.name_scope('environment/simulate'):
|
||||
if action.dtype in (tf.float16, tf.float32, tf.float64):
|
||||
action = tf.check_numerics(action, 'action')
|
||||
observ_dtype = self._parse_dtype(self._env.observation_space)
|
||||
observ, reward, done = tf.py_func(
|
||||
lambda a: self._env.step(a)[:3], [action],
|
||||
[observ_dtype, tf.float32, tf.bool], name='step')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
reward = tf.check_numerics(reward, 'reward')
|
||||
return tf.group(
|
||||
self._observ.assign(observ),
|
||||
self._action.assign(action),
|
||||
self._reward.assign(reward),
|
||||
self._done.assign(done),
|
||||
self._step.assign_add(1))
|
||||
|
||||
def reset(self):
|
||||
"""Reset the environment.
|
||||
|
||||
Returns:
|
||||
Tensor of the current observation.
|
||||
"""
|
||||
observ_dtype = self._parse_dtype(self._env.observation_space)
|
||||
observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
with tf.control_dependencies([
|
||||
self._observ.assign(observ),
|
||||
self._reward.assign(0),
|
||||
self._done.assign(False)]):
|
||||
return tf.identity(observ)
|
||||
|
||||
@property
|
||||
def observ(self):
|
||||
"""Access the variable holding the current observation."""
|
||||
return self._observ
|
||||
|
||||
@property
|
||||
def action(self):
|
||||
"""Access the variable holding the last recieved action."""
|
||||
return self._action
|
||||
|
||||
@property
|
||||
def reward(self):
|
||||
"""Access the variable holding the current reward."""
|
||||
return self._reward
|
||||
|
||||
@property
|
||||
def done(self):
|
||||
"""Access the variable indicating whether the episode is done."""
|
||||
return self._done
|
||||
|
||||
@property
|
||||
def step(self):
|
||||
"""Access the variable containg total steps of this environment."""
|
||||
return self._step
|
||||
|
||||
def _parse_shape(self, space):
|
||||
"""Get a tensor shape from a OpenAI Gym space.
|
||||
|
||||
Args:
|
||||
space: Gym space.
|
||||
|
||||
Returns:
|
||||
Shape tuple.
|
||||
"""
|
||||
if isinstance(space, gym.spaces.Discrete):
|
||||
return ()
|
||||
if isinstance(space, gym.spaces.Box):
|
||||
return space.shape
|
||||
raise NotImplementedError()
|
||||
|
||||
def _parse_dtype(self, space):
|
||||
"""Get a tensor dtype from a OpenAI Gym space.
|
||||
|
||||
Args:
|
||||
space: Gym space.
|
||||
|
||||
Returns:
|
||||
TensorFlow data type.
|
||||
"""
|
||||
if isinstance(space, gym.spaces.Discrete):
|
||||
return tf.int32
|
||||
if isinstance(space, gym.spaces.Box):
|
||||
return tf.float32
|
||||
raise NotImplementedError()
|
||||
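The single-environment counterpart follows the same pattern; a graph-construction sketch assuming TensorFlow 1.x (illustrative, same observation-dtype caveat as above):

import gym
import tensorflow as tf

env = InGraphEnv(gym.make('Pendulum-v0'))
action = tf.zeros(env.action_space.shape, tf.float32)
step_op = env.simulate(action)  # updates observ/action/reward/done and the step counter
reset_op = env.reset()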
233
examples/pybullet/gym/pybullet_envs/agents/tools/loop.py
Normal file
@@ -0,0 +1,233 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Execute operations in a loop and coordinate logging and checkpoints."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import os
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from . import streaming_mean
|
||||
|
||||
|
||||
_Phase = collections.namedtuple(
|
||||
'Phase',
|
||||
'name, writer, op, batch, steps, feed, report_every, log_every,'
|
||||
'checkpoint_every')
|
||||
|
||||
|
||||
class Loop(object):
|
||||
"""Execute operations in a loop and coordinate logging and checkpoints.
|
||||
|
||||
Supports multiple phases, that define their own operations to run, and
|
||||
intervals for reporting scores, logging summaries, and storing checkpoints.
|
||||
All class state is stored in-graph to properly recover from checkpoints.
|
||||
"""
|
||||
|
||||
def __init__(self, logdir, step=None, log=None, report=None, reset=None):
|
||||
"""Execute operations in a loop and coordinate logging and checkpoints.
|
||||
|
||||
The step, log, report, and reset arguments will be created if not
|
||||
provided. Reset is used to indicate switching to a new phase, so that the
|
||||
model can start a new computation in case its computation is split over
|
||||
multiple training steps.
|
||||
|
||||
Args:
|
||||
logdir: Will contain checkpoints and summaries for each phase.
|
||||
step: Variable of the global step (optional).
|
||||
log: Tensor indicating to the model to compute summary tensors.
|
||||
report: Tensor indicating to the loop to report the current mean score.
|
||||
reset: Tensor indicating to the model to start a new computation.
|
||||
"""
|
||||
self._logdir = logdir
|
||||
self._step = (
|
||||
tf.Variable(0, False, name='global_step') if step is None else step)
|
||||
self._log = tf.placeholder(tf.bool) if log is None else log
|
||||
self._report = tf.placeholder(tf.bool) if report is None else report
|
||||
self._reset = tf.placeholder(tf.bool) if reset is None else reset
|
||||
self._phases = []
|
||||
|
||||
def add_phase(
|
||||
self, name, done, score, summary, steps,
|
||||
report_every=None, log_every=None, checkpoint_every=None, feed=None):
|
||||
"""Add a phase to the loop protocol.
|
||||
|
||||
If the model breaks long computation into multiple steps, the done tensor
|
||||
indicates whether the current score should be added to the mean counter.
|
||||
For example, in reinforcement learning we only have a valid score at the
|
||||
end of the episode.
|
||||
|
||||
Score and done tensors can either be scalars or vectors, to support
|
||||
single and batched computations.
|
||||
|
||||
Args:
|
||||
name: Name for the phase, used for the summary writer.
|
||||
done: Tensor indicating whether current score can be used.
|
||||
score: Tensor holding the current, possibly intermediate, score.
|
||||
summary: Tensor holding summary string to write if not an empty string.
|
||||
steps: Duration of the phase in steps.
|
||||
report_every: Yield mean score every this number of steps.
|
||||
log_every: Request summaries via `log` tensor every this number of steps.
|
||||
checkpoint_every: Write checkpoint every this number of steps.
|
||||
feed: Additional feed dictionary for the session run call.
|
||||
|
||||
Raises:
|
||||
ValueError: Unknown rank for done or score tensors.
|
||||
"""
|
||||
done = tf.convert_to_tensor(done, tf.bool)
|
||||
score = tf.convert_to_tensor(score, tf.float32)
|
||||
summary = tf.convert_to_tensor(summary, tf.string)
|
||||
feed = feed or {}
|
||||
if done.shape.ndims is None or score.shape.ndims is None:
|
||||
raise ValueError("Rank of 'done' and 'score' tensors must be known.")
|
||||
writer = self._logdir and tf.summary.FileWriter(
|
||||
os.path.join(self._logdir, name), tf.get_default_graph(),
|
||||
flush_secs=60)
|
||||
op = self._define_step(done, score, summary)
|
||||
batch = 1 if score.shape.ndims == 0 else score.shape[0].value
|
||||
self._phases.append(_Phase(
|
||||
name, writer, op, batch, int(steps), feed, report_every,
|
||||
log_every, checkpoint_every))
|
||||
|
||||
def run(self, sess, saver, max_step=None):
|
||||
"""Run the loop schedule for a specified number of steps.
|
||||
|
||||
Call the operation of the current phase until the global step reaches the
|
||||
specified maximum step. Phases are repeated over and over in the order they
|
||||
were added.
|
||||
|
||||
Args:
|
||||
sess: Session to use to run the phase operation.
|
||||
saver: Saver used for checkpointing.
|
||||
max_step: Run the operations until the step reaches this limit.
|
||||
|
||||
Yields:
|
||||
Reported mean scores.
|
||||
"""
|
||||
global_step = sess.run(self._step)
|
||||
steps_made = 1
|
||||
while True:
|
||||
if max_step and global_step >= max_step:
|
||||
break
|
||||
phase, epoch, steps_in = self._find_current_phase(global_step)
|
||||
phase_step = epoch * phase.steps + steps_in
|
||||
if steps_in % phase.steps < steps_made:
|
||||
message = '\n' + ('-' * 50) + '\n'
|
||||
message += 'Phase {} (phase step {}, global step {}).'
|
||||
tf.logging.info(message.format(phase.name, phase_step, global_step))
|
||||
# Populate bookkeeping tensors.
|
||||
phase.feed[self._reset] = (steps_in < steps_made)
|
||||
phase.feed[self._log] = (
|
||||
phase.writer and
|
||||
self._is_every_steps(phase_step, phase.batch, phase.log_every))
|
||||
phase.feed[self._report] = (
|
||||
self._is_every_steps(phase_step, phase.batch, phase.report_every))
|
||||
summary, mean_score, global_step, steps_made = sess.run(
|
||||
phase.op, phase.feed)
|
||||
if self._is_every_steps(phase_step, phase.batch, phase.checkpoint_every):
|
||||
self._store_checkpoint(sess, saver, global_step)
|
||||
if self._is_every_steps(phase_step, phase.batch, phase.report_every):
|
||||
yield mean_score
|
||||
if summary and phase.writer:
|
||||
# We want smaller phases to catch up at the beginning of each epoch so
|
||||
# that their graphs are aligned.
|
||||
longest_phase = max(phase.steps for phase in self._phases)
|
||||
summary_step = epoch * longest_phase + steps_in
|
||||
phase.writer.add_summary(summary, summary_step)
|
||||
|
||||
def _is_every_steps(self, phase_step, batch, every):
|
||||
"""Determine whether a periodic event should happen at this step.
|
||||
|
||||
Args:
|
||||
phase_step: The incrementing step.
|
||||
batch: The number of steps progressed at once.
|
||||
every: The interval of the period.
|
||||
|
||||
Returns:
|
||||
Boolean of whether the event should happen.
|
||||
"""
|
||||
if not every:
|
||||
return False
|
||||
covered_steps = range(phase_step, phase_step + batch)
|
||||
return any((step + 1) % every == 0 for step in covered_steps)
|
||||
|
||||
def _find_current_phase(self, global_step):
|
||||
"""Determine the current phase based on the global step.
|
||||
|
||||
This ensures continuing the correct phase after restoring checkpoints.
|
||||
|
||||
Args:
|
||||
global_step: The global number of steps performed across all phases.
|
||||
|
||||
Returns:
|
||||
Tuple of phase object, epoch number, and phase steps within the epoch.
|
||||
"""
|
||||
epoch_size = sum(phase.steps for phase in self._phases)
|
||||
epoch = int(global_step // epoch_size)
|
||||
steps_in = global_step % epoch_size
|
||||
for phase in self._phases:
|
||||
if steps_in < phase.steps:
|
||||
return phase, epoch, steps_in
|
||||
steps_in -= phase.steps
|
||||
|
||||
def _define_step(self, done, score, summary):
|
||||
"""Combine operations of a phase.
|
||||
|
||||
Keeps track of the mean score and when to report it.
|
||||
|
||||
Args:
|
||||
done: Tensor indicating whether current score can be used.
|
||||
score: Tensor holding the current, possibly intermediate, score.
|
||||
summary: Tensor holding summary string to write if not an empty string.
|
||||
|
||||
Returns:
|
||||
Tuple of summary tensor, mean score, and new global step. The mean score
|
||||
is zero for non reporting steps.
|
||||
"""
|
||||
if done.shape.ndims == 0:
|
||||
done = done[None]
|
||||
if score.shape.ndims == 0:
|
||||
score = score[None]
|
||||
score_mean = streaming_mean.StreamingMean((), tf.float32)
|
||||
with tf.control_dependencies([done, score, summary]):
|
||||
done_score = tf.gather(score, tf.where(done)[:, 0])
|
||||
submit_score = tf.cond(
|
||||
tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
|
||||
with tf.control_dependencies([submit_score]):
|
||||
mean_score = tf.cond(self._report, score_mean.clear, float)
|
||||
steps_made = tf.shape(score)[0]
|
||||
next_step = self._step.assign_add(steps_made)
|
||||
with tf.control_dependencies([mean_score, next_step]):
|
||||
return tf.identity(summary), mean_score, next_step, steps_made
|
||||
|
||||
def _store_checkpoint(self, sess, saver, global_step):
|
||||
"""Store a checkpoint if a log directory was provided to the constructor.
|
||||
|
||||
The directory will be created if needed.
|
||||
|
||||
Args:
|
||||
sess: Session containing variables to store.
|
||||
saver: Saver used for checkpointing.
|
||||
global_step: Step number of the checkpoint name.
|
||||
"""
|
||||
if not self._logdir or not saver:
|
||||
return
|
||||
tf.gfile.MakeDirs(self._logdir)
|
||||
filename = os.path.join(self._logdir, 'model.ckpt')
|
||||
saver.save(sess, filename, global_step)
|
||||
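A minimal, self-contained sketch of the Loop protocol assuming TensorFlow 1.x; the constant tensors stand in for a real training operation and summary, and the log directory path is arbitrary:

import tensorflow as tf

done = tf.constant(True)    # stand-in for an "episode finished" tensor
score = tf.constant(1.0)    # stand-in for the per-step score tensor
summary = tf.constant('')   # empty summary string
loop = Loop('/tmp/agents_demo')
loop.add_phase('train', done, score, summary, steps=10, report_every=5)
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for mean_score in loop.run(sess, saver, max_step=20):
        print('mean score', mean_score)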
49
examples/pybullet/gym/pybullet_envs/agents/tools/mock_algorithm.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Mock algorithm for testing reinforcement learning code."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class MockAlgorithm(object):
|
||||
"""Produce random actions and empty summaries."""
|
||||
|
||||
def __init__(self, envs):
|
||||
"""Produce random actions and empty summaries.
|
||||
|
||||
Args:
|
||||
envs: List of in-graph environments.
|
||||
"""
|
||||
self._envs = envs
|
||||
|
||||
def begin_episode(self, unused_agent_indices):
|
||||
return tf.constant('')
|
||||
|
||||
def perform(self, agent_indices, unused_observ):
|
||||
shape = (tf.shape(agent_indices)[0],) + self._envs[0].action_space.shape
|
||||
low = self._envs[0].action_space.low
|
||||
high = self._envs[0].action_space.high
|
||||
action = tf.random_uniform(shape) * (high - low) + low
|
||||
return action, tf.constant('')
|
||||
|
||||
def experience(self, unused_agent_indices, *unused_transition):
|
||||
return tf.constant('')
|
||||
|
||||
def end_episode(self, unused_agent_indices):
|
||||
return tf.constant('')
|
||||
86
examples/pybullet/gym/pybullet_envs/agents/tools/mock_environment.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Mock environment for testing reinforcement learning code."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import gym.spaces
|
||||
import numpy as np
|
||||
|
||||
|
||||
class MockEnvironment(object):
|
||||
"""Generate random agent input and keep track of statistics."""
|
||||
|
||||
def __init__(self, observ_shape, action_shape, min_duration, max_duration):
|
||||
"""Generate random agent input and keep track of statistics.
|
||||
|
||||
Args:
|
||||
observ_shape: Shape for the random observations.
|
||||
action_shape: Shape for the action space.
|
||||
min_duration: Minimum number of steps per episode.
|
||||
max_duration: Maximum number of steps per episode.
|
||||
|
||||
Attributes:
|
||||
steps: List of actual simulated lengths for all episodes.
|
||||
durations: List of decided lengths for all episodes.
|
||||
"""
|
||||
self._observ_shape = observ_shape
|
||||
self._action_shape = action_shape
|
||||
self._min_duration = min_duration
|
||||
self._max_duration = max_duration
|
||||
self._random = np.random.RandomState(0)
|
||||
self.steps = []
|
||||
self.durations = []
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
low = np.zeros(self._observ_shape)
|
||||
high = np.ones(self._observ_shape)
|
||||
return gym.spaces.Box(low, high)
|
||||
|
||||
@property
|
||||
def action_space(self):
|
||||
low = np.zeros(self._action_shape)
|
||||
high = np.ones(self._action_shape)
|
||||
return gym.spaces.Box(low, high)
|
||||
|
||||
@property
|
||||
def unwrapped(self):
|
||||
return self
|
||||
|
||||
def step(self, action):
|
||||
assert self.action_space.contains(action)
|
||||
assert self.steps[-1] < self.durations[-1]
|
||||
self.steps[-1] += 1
|
||||
observ = self._current_observation()
|
||||
reward = self._current_reward()
|
||||
done = self.steps[-1] >= self.durations[-1]
|
||||
info = {}
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
duration = self._random.randint(self._min_duration, self._max_duration + 1)
|
||||
self.steps.append(0)
|
||||
self.durations.append(duration)
|
||||
return self._current_observation()
|
||||
|
||||
def _current_observation(self):
|
||||
return self._random.uniform(0, 1, self._observ_shape)
|
||||
|
||||
def _current_reward(self):
|
||||
return self._random.uniform(-1, 1)
|
||||
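A short usage sketch of the mock environment (illustrative; the shapes and durations are arbitrary):

env = MockEnvironment(observ_shape=(3,), action_shape=(2,), min_duration=2, max_duration=4)
observ = env.reset()
done = False
while not done:
    observ, reward, done, info = env.step(env.action_space.sample())
print(env.steps, env.durations)  # actual vs. decided episode lengths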
147
examples/pybullet/gym/pybullet_envs/agents/tools/simulate.py
Normal file
@@ -0,0 +1,147 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""In-graph simulation step of a vectorized algorithm with environments."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from . import streaming_mean
|
||||
|
||||
|
||||
def simulate(batch_env, algo, log=True, reset=False):
|
||||
"""Simulation step of a vecrotized algorithm with in-graph environments.
|
||||
|
||||
Integrates the operations implemented by the algorithm and the environments
|
||||
into a combined operation.
|
||||
|
||||
Args:
|
||||
batch_env: In-graph batch environment.
|
||||
algo: Algorithm instance implementing required operations.
|
||||
log: Tensor indicating whether to compute and return summaries.
|
||||
reset: Tensor causing all environments to reset.
|
||||
|
||||
Returns:
|
||||
Tuple of tensors containing done flags for the current episodes, possibly
|
||||
intermediate scores for the episodes, and a summary tensor.
|
||||
"""
|
||||
|
||||
def _define_begin_episode(agent_indices):
|
||||
"""Reset environments, intermediate scores and durations for new episodes.
|
||||
|
||||
Args:
|
||||
agent_indices: Tensor containing batch indices starting an episode.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
assert agent_indices.shape.ndims == 1
|
||||
zero_scores = tf.zeros_like(agent_indices, tf.float32)
|
||||
zero_durations = tf.zeros_like(agent_indices)
|
||||
reset_ops = [
|
||||
batch_env.reset(agent_indices),
|
||||
tf.scatter_update(score, agent_indices, zero_scores),
|
||||
tf.scatter_update(length, agent_indices, zero_durations)]
|
||||
with tf.control_dependencies(reset_ops):
|
||||
return algo.begin_episode(agent_indices)
|
||||
|
||||
def _define_step():
|
||||
"""Request actions from the algorithm and apply them to the environments.
|
||||
|
||||
Increments the lengths of all episodes and increases their scores by the
|
||||
current reward. After stepping the environments, provides the full
|
||||
transition tuple to the algorithm.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
prevob = batch_env.observ + 0 # Ensure a copy of the variable value.
|
||||
agent_indices = tf.range(len(batch_env))
|
||||
action, step_summary = algo.perform(agent_indices, prevob)
|
||||
action.set_shape(batch_env.action.shape)
|
||||
with tf.control_dependencies([batch_env.simulate(action)]):
|
||||
add_score = score.assign_add(batch_env.reward)
|
||||
inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
|
||||
with tf.control_dependencies([add_score, inc_length]):
|
||||
agent_indices = tf.range(len(batch_env))
|
||||
experience_summary = algo.experience(
|
||||
agent_indices, prevob, batch_env.action, batch_env.reward,
|
||||
batch_env.done, batch_env.observ)
|
||||
return tf.summary.merge([step_summary, experience_summary])
|
||||
|
||||
def _define_end_episode(agent_indices):
|
||||
"""Notify the algorithm of ending episodes.
|
||||
|
||||
Also updates the mean score and length counters used for summaries.
|
||||
|
||||
Args:
|
||||
agent_indices: Tensor holding batch indices that end their episodes.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
assert agent_indices.shape.ndims == 1
|
||||
submit_score = mean_score.submit(tf.gather(score, agent_indices))
|
||||
submit_length = mean_length.submit(
|
||||
tf.cast(tf.gather(length, agent_indices), tf.float32))
|
||||
with tf.control_dependencies([submit_score, submit_length]):
|
||||
return algo.end_episode(agent_indices)
|
||||
|
||||
def _define_summaries():
|
||||
"""Reset the average score and duration, and return them as summary.
|
||||
|
||||
Returns:
|
||||
Summary string.
|
||||
"""
|
||||
score_summary = tf.cond(
|
||||
tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
|
||||
lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
|
||||
length_summary = tf.cond(
|
||||
tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
|
||||
lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
|
||||
return tf.summary.merge([score_summary, length_summary])
|
||||
|
||||
with tf.name_scope('simulate'):
|
||||
log = tf.convert_to_tensor(log)
|
||||
reset = tf.convert_to_tensor(reset)
|
||||
with tf.variable_scope('simulate_temporary'):
|
||||
score = tf.Variable(
|
||||
tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
|
||||
length = tf.Variable(
|
||||
tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
|
||||
mean_score = streaming_mean.StreamingMean((), tf.float32)
|
||||
mean_length = streaming_mean.StreamingMean((), tf.float32)
|
||||
agent_indices = tf.cond(
|
||||
reset,
|
||||
lambda: tf.range(len(batch_env)),
|
||||
lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
|
||||
begin_episode = tf.cond(
|
||||
tf.cast(tf.shape(agent_indices)[0], tf.bool),
|
||||
lambda: _define_begin_episode(agent_indices), str)
|
||||
with tf.control_dependencies([begin_episode]):
|
||||
step = _define_step()
|
||||
with tf.control_dependencies([step]):
|
||||
agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
|
||||
end_episode = tf.cond(
|
||||
tf.cast(tf.shape(agent_indices)[0], tf.bool),
|
||||
lambda: _define_end_episode(agent_indices), str)
|
||||
with tf.control_dependencies([end_episode]):
|
||||
summary = tf.summary.merge([
|
||||
_define_summaries(), begin_episode, step, end_episode])
|
||||
with tf.control_dependencies([summary]):
|
||||
done, score = tf.identity(batch_env.done), tf.identity(score)
|
||||
return done, score, summary
|
||||
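A graph-construction sketch of how these pieces compose, assuming TensorFlow 1.x and the mock classes from this commit (illustrative; running it additionally requires variable initialization and observation dtypes compatible with tf.py_func):

import tensorflow as tf

envs = [MockEnvironment((2,), (1,), min_duration=2, max_duration=4) for _ in range(3)]
batch_env = InGraphBatchEnv(BatchEnv(envs, blocking=True))
algo = MockAlgorithm(batch_env)
done, score, summary = simulate(batch_env, algo, log=False, reset=False)
# `done`, `score` and `summary` can then be evaluated repeatedly in a session.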
67
examples/pybullet/gym/pybullet_envs/agents/tools/streaming_mean.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Compute a streaming estimation of the mean of submitted tensors."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class StreamingMean(object):
|
||||
"""Compute a streaming estimation of the mean of submitted tensors."""
|
||||
|
||||
def __init__(self, shape, dtype):
|
||||
"""Specify the shape and dtype of the mean to be estimated.
|
||||
|
||||
Note that the float mean of zero submitted elements is NaN, while computing
|
||||
the integer mean of zero elements raises a division by zero error.
|
||||
|
||||
Args:
|
||||
shape: Shape of the mean to compute.
|
||||
dtype: Data type of the mean to compute.
|
||||
"""
|
||||
self._dtype = dtype
|
||||
self._sum = tf.Variable(lambda: tf.zeros(shape, dtype), False)
|
||||
self._count = tf.Variable(lambda: 0, trainable=False)
|
||||
|
||||
@property
|
||||
def value(self):
|
||||
"""The current value of the mean."""
|
||||
return self._sum / tf.cast(self._count, self._dtype)
|
||||
|
||||
@property
|
||||
def count(self):
|
||||
"""The number of submitted samples."""
|
||||
return self._count
|
||||
|
||||
def submit(self, value):
|
||||
"""Submit a single or batch tensor to refine the streaming mean."""
|
||||
# Add a batch dimension if necessary.
|
||||
if value.shape.ndims == self._sum.shape.ndims:
|
||||
value = value[None, ...]
|
||||
return tf.group(
|
||||
self._sum.assign_add(tf.reduce_sum(value, 0)),
|
||||
self._count.assign_add(tf.shape(value)[0]))
|
||||
|
||||
def clear(self):
|
||||
"""Return the mean estimate and reset the streaming statistics."""
|
||||
value = self._sum / tf.cast(self._count, self._dtype)
|
||||
with tf.control_dependencies([value]):
|
||||
reset_value = self._sum.assign(tf.zeros_like(self._sum))
|
||||
reset_count = self._count.assign(0)
|
||||
with tf.control_dependencies([reset_value, reset_count]):
|
||||
return tf.identity(value)
|
||||
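A minimal sketch of StreamingMean assuming TensorFlow 1.x sessions (illustrative; the submitted values are arbitrary):

import tensorflow as tf

mean = StreamingMean((), tf.float32)
submit = mean.submit(tf.constant([1.0, 3.0]))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(submit)
    print(sess.run(mean.value))    # 2.0
    print(sess.run(mean.clear()))  # returns the mean and resets sum and count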
558
examples/pybullet/gym/pybullet_envs/agents/tools/wrappers.py
Normal file
@@ -0,0 +1,558 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Wrappers for OpenAI Gym environments."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import atexit
|
||||
import multiprocessing
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
import gym
|
||||
import gym.spaces
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class AutoReset(object):
|
||||
"""Automatically reset environment when the episode is done."""
|
||||
|
||||
def __init__(self, env):
|
||||
self._env = env
|
||||
self._done = True
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
def step(self, action):
|
||||
if self._done:
|
||||
observ, reward, done, info = self._env.reset(), 0.0, False, {}
|
||||
else:
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
self._done = done
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
self._done = False
|
||||
return self._env.reset()
|
||||
|
||||
|
||||
class ActionRepeat(object):
|
||||
"""Repeat the agent action multiple steps."""
|
||||
|
||||
def __init__(self, env, amount):
|
||||
self._env = env
|
||||
self._amount = amount
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
def step(self, action):
|
||||
done = False
|
||||
total_reward = 0
|
||||
current_step = 0
|
||||
while current_step < self._amount and not done:
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
total_reward += reward
|
||||
current_step += 1
|
||||
return observ, total_reward, done, info
|
||||
|
||||
|
||||
class RandomStart(object):
|
||||
"""Perform random number of random actions at the start of the episode."""
|
||||
|
||||
def __init__(self, env, max_steps):
|
||||
self._env = env
|
||||
self._max_steps = max_steps
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
def reset(self):
|
||||
observ = self._env.reset()
|
||||
random_steps = np.random.randint(0, self._max_steps)
|
||||
for _ in range(random_steps):
|
||||
action = self._env.action_space.sample()
|
||||
observ, unused_reward, done, unused_info = self._env.step(action)
|
||||
if done:
|
||||
tf.logging.warning('Episode ended during random start.')
|
||||
return self.reset()
|
||||
return observ
|
||||
|
||||
|
||||
class FrameHistory(object):
|
||||
"""Augment the observation with past observations."""
|
||||
|
||||
def __init__(self, env, past_indices, flatten):
|
||||
"""Augment the observation with past observations.
|
||||
|
||||
Implemented as a Numpy ring buffer holding the necessary past observations.
|
||||
|
||||
Args:
|
||||
env: OpenAI Gym environment to wrap.
|
||||
past_indices: List of non-negative integers indicating the time offsets
|
||||
from the current time step of observations to include.
|
||||
flatten: Concatenate the past observations rather than stacking them.
|
||||
|
||||
Raises:
|
||||
KeyError: The current observation is not included in the indices.
|
||||
"""
|
||||
if 0 not in past_indices:
|
||||
raise KeyError('Past indices should include 0 for the current frame.')
|
||||
self._env = env
|
||||
self._past_indices = past_indices
|
||||
self._step = 0
|
||||
self._buffer = None
|
||||
self._capacity = max(past_indices)
|
||||
self._flatten = flatten
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
low = self._env.observation_space.low
|
||||
high = self._env.observation_space.high
|
||||
low = np.repeat(low[None, ...], len(self._past_indices), 0)
|
||||
high = np.repeat(high[None, ...], len(self._past_indices), 0)
|
||||
if self._flatten:
|
||||
low = np.reshape(low, (-1,) + low.shape[2:])
|
||||
high = np.reshape(high, (-1,) + high.shape[2:])
|
||||
return gym.spaces.Box(low, high)
|
||||
|
||||
def step(self, action):
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
self._step += 1
|
||||
self._buffer[self._step % self._capacity] = observ
|
||||
observ = self._select_frames()
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
observ = self._env.reset()
|
||||
self._buffer = np.repeat(observ[None, ...], self._capacity, 0)
|
||||
self._step = 0
|
||||
return self._select_frames()
|
||||
|
||||
def _select_frames(self):
|
||||
indices = [
|
||||
(self._step - index) % self._capacity for index in self._past_indices]
|
||||
observ = self._buffer[indices]
|
||||
if self._flatten:
|
||||
observ = np.reshape(observ, (-1,) + observ.shape[2:])
|
||||
return observ
|
||||
|
||||
|
||||
class FrameDelta(object):
|
||||
"""Convert the observation to a difference from the previous observation."""
|
||||
|
||||
def __init__(self, env):
|
||||
self._env = env
|
||||
self._last = None
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
low = self._env.observation_space.low
|
||||
high = self._env.observation_space.high
|
||||
low, high = low - high, high - low
|
||||
return gym.spaces.Box(low, high)
|
||||
|
||||
def step(self, action):
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
delta = observ - self._last
|
||||
self._last = observ
|
||||
return delta, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
observ = self._env.reset()
|
||||
self._last = observ
|
||||
return observ
|
||||
|
||||
|
||||
class RangeNormalize(object):
|
||||
"""Normalize the specialized observation and action ranges to [-1, 1]."""
|
||||
|
||||
def __init__(self, env, observ=None, action=None):
|
||||
self._env = env
|
||||
self._should_normalize_observ = (
|
||||
observ is not False and self._is_finite(self._env.observation_space))
|
||||
if observ is True and not self._should_normalize_observ:
|
||||
raise ValueError('Cannot normalize infinite observation range.')
|
||||
if observ is None and not self._should_normalize_observ:
|
||||
tf.logging.info('Not normalizing infinite observation range.')
|
||||
self._should_normalize_action = (
|
||||
action is not False and self._is_finite(self._env.action_space))
|
||||
if action is True and not self._should_normalize_action:
|
||||
raise ValueError('Cannot normalize infinite action range.')
|
||||
if action is None and not self._should_normalize_action:
|
||||
tf.logging.info('Not normalizing infinite action range.')
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
space = self._env.observation_space
|
||||
if not self._should_normalize_observ:
|
||||
return space
|
||||
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
|
||||
|
||||
@property
|
||||
def action_space(self):
|
||||
space = self._env.action_space
|
||||
if not self._should_normalize_action:
|
||||
return space
|
||||
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
|
||||
|
||||
def step(self, action):
|
||||
if self._should_normalize_action:
|
||||
action = self._denormalize_action(action)
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
if self._should_normalize_observ:
|
||||
observ = self._normalize_observ(observ)
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
observ = self._env.reset()
|
||||
if self._should_normalize_observ:
|
||||
observ = self._normalize_observ(observ)
|
||||
return observ
|
||||
|
||||
def _denormalize_action(self, action):
|
||||
min_ = self._env.action_space.low
|
||||
max_ = self._env.action_space.high
|
||||
action = (action + 1) / 2 * (max_ - min_) + min_
|
||||
return action
|
||||
|
||||
def _normalize_observ(self, observ):
|
||||
min_ = self._env.observation_space.low
|
||||
max_ = self._env.observation_space.high
|
||||
observ = 2 * (observ - min_) / (max_ - min_) - 1
|
||||
return observ
|
||||
|
||||
def _is_finite(self, space):
|
||||
return np.isfinite(space.low).all() and np.isfinite(space.high).all()
|
||||
|
||||
|
||||
class ClipAction(object):
|
||||
"""Clip out of range actions to the action space of the environment."""
|
||||
|
||||
def __init__(self, env):
|
||||
self._env = env
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
@property
|
||||
def action_space(self):
|
||||
shape = self._env.action_space.shape
|
||||
return gym.spaces.Box(-np.inf * np.ones(shape), np.inf * np.ones(shape))
|
||||
|
||||
def step(self, action):
|
||||
action_space = self._env.action_space
|
||||
action = np.clip(action, action_space.low, action_space.high)
|
||||
return self._env.step(action)
|
||||
|
||||
|
||||
class LimitDuration(object):
  """End episodes after a specified number of steps."""

  def __init__(self, env, duration):
    self._env = env
    self._duration = duration
    self._step = None

  def __getattr__(self, name):
    return getattr(self._env, name)

  def step(self, action):
    if self._step is None:
      raise RuntimeError('Must reset environment.')
    observ, reward, done, info = self._env.step(action)
    self._step += 1
    if self._step >= self._duration:
      done = True
      self._step = None
    return observ, reward, done, info

  def reset(self):
    self._step = 0
    return self._env.reset()
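# Editor's note: _example_limit_duration below is an illustrative sketch added
# in this copy; it is not part of the upstream TensorFlow Agents file. It shows
# that the wrapper forces `done` once the step budget is reached and requires
# reset() before stepping. The stub environment is an assumption for the
# example only.
def _example_limit_duration():
  class _StubEnv(object):
    def reset(self):
      return 0.0

    def step(self, action):
      return 0.0, 0.0, False, {}

  env = LimitDuration(_StubEnv(), duration=2)
  env.reset()
  _, _, done, _ = env.step(0)  # step 1 of 2, done is False
  _, _, done, _ = env.step(0)  # step 2 reaches the limit, done is True
  return done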
class ExternalProcess(object):
  """Step environment in a separate process for lock-free parallelism."""

  # Message types for communication via the pipe.
  _ACCESS = 1
  _CALL = 2
  _RESULT = 3
  _EXCEPTION = 4
  _CLOSE = 5
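  # Editor's note (added in this copy): the parent and the worker communicate
  # by sending (message_type, payload) tuples over a multiprocessing Pipe.
  # _ACCESS requests an attribute by name and _CALL invokes a method with a
  # (name, args, kwargs) payload; the worker answers either request with
  # (_RESULT, value) on success or (_EXCEPTION, stacktrace) on failure.
  # _CLOSE asks the worker to shut down and carries a payload of None.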
  def __init__(self, constructor):
    """Step environment in a separate process for lock-free parallelism.

    The environment will be created in the external process by calling the
    specified callable. This can be an environment class, or a function
    creating the environment and potentially wrapping it. The returned
    environment should not access global variables.

    Args:
      constructor: Callable that creates and returns an OpenAI gym environment.

    Attributes:
      observation_space: The cached observation space of the environment.
      action_space: The cached action space of the environment.
    """
    self._conn, conn = multiprocessing.Pipe()
    self._process = multiprocessing.Process(
        target=self._worker, args=(constructor, conn))
    atexit.register(self.close)
    self._process.start()
    self._observ_space = None
    self._action_space = None

  @property
  def observation_space(self):
    if not self._observ_space:
      self._observ_space = self.__getattr__('observation_space')
    return self._observ_space

  @property
  def action_space(self):
    if not self._action_space:
      self._action_space = self.__getattr__('action_space')
    return self._action_space

  def __getattr__(self, name):
    """Request an attribute from the environment.

    Note that this involves communication with the external process, so it can
    be slow.

    Args:
      name: Attribute to access.

    Returns:
      Value of the attribute.
    """
    self._conn.send((self._ACCESS, name))
    return self._receive()

  def call(self, name, *args, **kwargs):
    """Asynchronously call a method of the external environment.

    Args:
      name: Name of the method to call.
      *args: Positional arguments to forward to the method.
      **kwargs: Keyword arguments to forward to the method.

    Returns:
      Promise object that blocks and provides the return value when called.
    """
    payload = name, args, kwargs
    self._conn.send((self._CALL, payload))
    return self._receive

  def close(self):
    """Send a close message to the external process and join it."""
    try:
      self._conn.send((self._CLOSE, None))
      self._conn.close()
    except IOError:
      # The connection was already closed.
      pass
    self._process.join()

  def step(self, action, blocking=True):
    """Step the environment.

    Args:
      action: The action to apply to the environment.
      blocking: Whether to wait for the result.

    Returns:
      Transition tuple when blocking, otherwise callable that returns the
      transition tuple.
    """
    promise = self.call('step', action)
    if blocking:
      return promise()
    else:
      return promise

  def reset(self, blocking=True):
    """Reset the environment.

    Args:
      blocking: Whether to wait for the result.

    Returns:
      New observation when blocking, otherwise callable that returns the new
      observation.
    """
    promise = self.call('reset')
    if blocking:
      return promise()
    else:
      return promise

  def _receive(self):
    """Wait for a message from the worker process and return its payload.

    Raises:
      Exception: An exception was raised inside the worker process.
      KeyError: The received message is of an unknown type.

    Returns:
      Payload object of the message.
    """
    message, payload = self._conn.recv()
    # Re-raise exceptions in the main process.
    if message == self._EXCEPTION:
      stacktrace = payload
      raise Exception(stacktrace)
    if message == self._RESULT:
      return payload
    raise KeyError('Received message of unexpected type {}'.format(message))

  def _worker(self, constructor, conn):
    """The process waits for actions and sends back environment results.

    Args:
      constructor: Constructor for the OpenAI Gym environment.
      conn: Connection for communication to the main process.
    """
    try:
      env = constructor()
      while True:
        try:
          # Only block for short times to have keyboard exceptions be raised.
          if not conn.poll(0.1):
            continue
          message, payload = conn.recv()
        except (EOFError, KeyboardInterrupt):
          break
        if message == self._ACCESS:
          name = payload
          result = getattr(env, name)
          conn.send((self._RESULT, result))
          continue
        if message == self._CALL:
          name, args, kwargs = payload
          result = getattr(env, name)(*args, **kwargs)
          conn.send((self._RESULT, result))
          continue
        if message == self._CLOSE:
          assert payload is None
          break
        raise KeyError('Received message of unknown type {}'.format(message))
    except Exception:  # pylint: disable=broad-except
      stacktrace = ''.join(traceback.format_exception(*sys.exc_info()))
      tf.logging.error('Error in environment process: {}'.format(stacktrace))
      conn.send((self._EXCEPTION, stacktrace))
    conn.close()
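# Editor's note: _example_external_process below is an illustrative sketch
# added in this copy; it is not part of the upstream TensorFlow Agents file.
# It shows the non-blocking promise interface: both resets are issued first
# and only then awaited, so the two child processes work concurrently. The
# 'Pendulum-v0' environment id and the use of functools.partial with gym.make
# are assumptions for the example only.
def _example_external_process():
  import functools
  constructor = functools.partial(gym.make, 'Pendulum-v0')
  envs = [ExternalProcess(constructor) for _ in range(2)]
  promises = [env.reset(blocking=False) for env in envs]  # issue both resets
  observs = [promise() for promise in promises]           # then wait for both
  for env in envs:
    env.close()
  return observs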
class ConvertTo32Bit(object):
  """Convert data types of an OpenAI Gym environment to 32 bit."""

  def __init__(self, env):
    """Convert data types of an OpenAI Gym environment to 32 bit.

    Args:
      env: OpenAI Gym environment.
    """
    self._env = env

  def __getattr__(self, name):
    """Forward unimplemented attributes to the original environment.

    Args:
      name: Attribute that was accessed.

    Returns:
      Value behind the attribute name in the wrapped environment.
    """
    return getattr(self._env, name)

  def step(self, action):
    """Forward action to the wrapped environment.

    Args:
      action: Action to apply to the environment.

    Raises:
      ValueError: Invalid action.

    Returns:
      Converted observation, converted reward, done flag, and info object.
    """
    observ, reward, done, info = self._env.step(action)
    observ = self._convert_observ(observ)
    reward = self._convert_reward(reward)
    return observ, reward, done, info

  def reset(self):
    """Reset the environment and convert the resulting observation.

    Returns:
      Converted observation.
    """
    observ = self._env.reset()
    observ = self._convert_observ(observ)
    return observ

  def _convert_observ(self, observ):
    """Convert the observation to 32 bits.

    Args:
      observ: Numpy observation.

    Raises:
      ValueError: Observation contains infinite values.

    Returns:
      Numpy observation with 32-bit data type.
    """
    if not np.isfinite(observ).all():
      raise ValueError('Infinite observation encountered.')
    if observ.dtype == np.float64:
      return observ.astype(np.float32)
    if observ.dtype == np.int64:
      return observ.astype(np.int32)
    return observ

  def _convert_reward(self, reward):
    """Convert the reward to 32 bits.

    Args:
      reward: Numpy reward.

    Raises:
      ValueError: Rewards contain infinite values.

    Returns:
      Numpy reward with 32-bit data type.
    """
    if not np.isfinite(reward).all():
      raise ValueError('Infinite reward encountered.')
    return np.array(reward, dtype=np.float32)
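# Editor's note: _example_convert_to_32_bit below is an illustrative sketch
# added in this copy; it is not part of the upstream TensorFlow Agents file.
# ConvertTo32Bit casts float64/int64 observations to 32-bit types and rewards
# to float32, keeping downstream TensorFlow tensors in 32-bit precision. The
# stub environment is an assumption for the example only.
def _example_convert_to_32_bit():
  class _StubEnv(object):
    def reset(self):
      return np.zeros(3, dtype=np.float64)

    def step(self, action):
      return np.zeros(3, dtype=np.float64), 1.0, False, {}

  env = ConvertTo32Bit(_StubEnv())
  observ = env.reset()                      # observ.dtype is float32
  observ, reward, done, info = env.step(0)  # reward becomes a float32 array
  return observ.dtype, reward.dtype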