Merge pull request #2565 from araffin/feat/sb-example

Add Stable-Baselines example with SAC and TD3
This commit is contained in:
erwincoumans
2020-01-08 11:51:35 -08:00
committed by GitHub
3 changed files with 174 additions and 0 deletions

View File

@@ -0,0 +1,60 @@
# Code adapted from https://github.com/araffin/rl-baselines-zoo
# It requires stable-baselines to be installed
# Colab Notebook: https://colab.research.google.com/drive/1nZkHO4QTYfAksm9ZTaZ5vXyC7szZxC3F
# Author: Antonin RAFFIN
# MIT License
import argparse

import pybullet_envs  # imported for its side effect: registers the Bullet envs with gym
import gym
import numpy as np
from stable_baselines import SAC, TD3
from stable_baselines.common.noise import NormalActionNoise

from utils import TimeFeatureWrapper, EvalCallback


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--algo', help='RL Algorithm (Soft Actor-Critic by default)', default='sac',
                        type=str, required=False, choices=['sac', 'td3'])
    parser.add_argument('--env', type=str, default='HalfCheetahBulletEnv-v0', help='environment ID')
    parser.add_argument('-n', '--n-timesteps', help='Number of training timesteps', default=int(1e6),
                        type=int)
    args = parser.parse_args()

    env_id = args.env
    n_timesteps = args.n_timesteps
    save_path = '{}_{}'.format(args.algo, env_id)

    # Instantiate and wrap the environment
    env = TimeFeatureWrapper(gym.make(env_id))

    # Create the evaluation environment and callback
    eval_env = TimeFeatureWrapper(gym.make(env_id))
    callback = EvalCallback(eval_env, best_model_save_path=save_path + '_best')

    algo = {
        'sac': SAC,
        'td3': TD3
    }[args.algo]

    n_actions = env.action_space.shape[0]

    # Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
    hyperparams = {
        'sac': dict(batch_size=256, gamma=0.98, policy_kwargs=dict(layers=[256, 256]),
                    learning_starts=10000, buffer_size=int(2e5), tau=0.01),
        'td3': dict(batch_size=100, policy_kwargs=dict(layers=[400, 300]),
                    learning_rate=1e-3, learning_starts=10000, buffer_size=int(1e6),
                    train_freq=1000, gradient_steps=1000,
                    action_noise=NormalActionNoise(mean=np.zeros(n_actions),
                                                   sigma=0.1 * np.ones(n_actions)))
    }[args.algo]

    model = algo('MlpPolicy', env, verbose=1, **hyperparams)
    model.learn(n_timesteps, callback=callback)

    print("Saving to {}.zip".format(save_path))
    model.save(save_path)
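
    # Optional sanity check (illustrative sketch, not required by the example above):
    # reload the agent that was just saved and run a short deterministic rollout
    # in the wrapped environment.
    model = algo.load(save_path)
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)
        if done:
            obs = env.reset()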

View File

@@ -0,0 +1,114 @@
# Code adapted from https://github.com/araffin/rl-baselines-zoo
# It requires stable-baselines to be installed
# Author: Antonin RAFFIN
# MIT License
import gym
import numpy as np
from gym.wrappers import TimeLimit
from stable_baselines.common.evaluation import evaluate_policy


class TimeFeatureWrapper(gym.Wrapper):
    """
    Add the remaining time to the observation space for fixed-length episodes.
    See https://arxiv.org/abs/1712.00378 and https://github.com/aravindr93/mjrl/issues/13.

    :param env: (gym.Env)
    :param max_steps: (int) Max number of steps of an episode
        if it is not wrapped in a TimeLimit object.
    :param test_mode: (bool) In test mode, the time feature is held constant (1.0 here).
        This allows checking that the agent did not overfit to this feature by learning
        a deterministic, pre-defined sequence of actions.
    """

    def __init__(self, env, max_steps=1000, test_mode=False):
        assert isinstance(env.observation_space, gym.spaces.Box)
        # Add a time feature to the observation
        low, high = env.observation_space.low, env.observation_space.high
        low, high = np.concatenate((low, [0])), np.concatenate((high, [1.]))
        env.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)

        super(TimeFeatureWrapper, self).__init__(env)

        if isinstance(env, TimeLimit):
            self._max_steps = env._max_episode_steps
        else:
            self._max_steps = max_steps
        self._current_step = 0
        self._test_mode = test_mode

    def reset(self):
        self._current_step = 0
        return self._get_obs(self.env.reset())

    def step(self, action):
        self._current_step += 1
        obs, reward, done, info = self.env.step(action)
        return self._get_obs(obs), reward, done, info

    def _get_obs(self, obs):
        """
        Concatenate the time feature to the current observation.

        :param obs: (np.ndarray)
        :return: (np.ndarray)
        """
        # Remaining time (rather than the elapsed number of steps) is more general
        time_feature = 1 - (self._current_step / self._max_steps)
        if self._test_mode:
            time_feature = 1.0
        # Optionally: concatenate [time_feature, time_feature ** 2]
        return np.concatenate((obs, [time_feature]))

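
# Example (illustrative sketch only; it assumes a Box-observation env such as gym's
# Pendulum-v0 is available):
#
#     env = TimeFeatureWrapper(gym.make('Pendulum-v0'))
#     obs = env.reset()
#     # obs now has one extra entry: the remaining time, scaled to [0, 1]
#     assert obs.shape[0] == gym.make('Pendulum-v0').observation_space.shape[0] + 1
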
class EvalCallback(object):
    """
    Callback for evaluating an agent.

    :param eval_env: (gym.Env) The environment used for evaluation
    :param n_eval_episodes: (int) The number of episodes to test the agent
    :param eval_freq: (int) Evaluate the agent every ``eval_freq`` calls of the callback.
    :param deterministic: (bool) Whether to use deterministic actions during evaluation
    :param best_model_save_path: (str) Path used to save the best model (no saving if None)
    :param verbose: (int) Verbosity level (evaluation results are printed if > 0)
    """

    def __init__(self, eval_env, n_eval_episodes=5, eval_freq=10000,
                 deterministic=True, best_model_save_path=None, verbose=1):
        super(EvalCallback, self).__init__()
        self.n_eval_episodes = n_eval_episodes
        self.eval_freq = eval_freq
        self.best_mean_reward = -np.inf
        self.deterministic = deterministic
        self.eval_env = eval_env
        self.verbose = verbose
        self.model, self.num_timesteps = None, 0
        self.best_model_save_path = best_model_save_path
        self.n_calls = 0

    def __call__(self, locals_, globals_):
        """
        Callback passed to ``model.learn()``: called at every training step with the
        local and global variables of the training loop.

        :param locals_: (dict)
        :param globals_: (dict)
        :return: (bool) Whether training should continue
        """
        self.n_calls += 1
        self.model = locals_['self']
        self.num_timesteps = self.model.num_timesteps

        if self.n_calls % self.eval_freq == 0:
            episode_rewards, _ = evaluate_policy(self.model, self.eval_env,
                                                 n_eval_episodes=self.n_eval_episodes,
                                                 deterministic=self.deterministic,
                                                 return_episode_rewards=True)
            mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)

            if self.verbose > 0:
                print("Eval num_timesteps={}, "
                      "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps,
                                                                mean_reward, std_reward))

            if mean_reward > self.best_mean_reward:
                if self.best_model_save_path is not None:
                    print("Saving best model")
                    self.model.save(self.best_model_save_path)
                self.best_mean_reward = mean_reward
        return True
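

if __name__ == '__main__':
    # Minimal sanity check of the callback protocol (a sketch; the dummy model and the
    # Pendulum-v0 env below are assumptions used only for illustration, no training runs).
    # stable-baselines invokes the callback as callback(locals_, globals_) at every
    # training step, with locals_['self'] being the model; returning True continues
    # training, returning False would stop it early.
    class _DummyModel(object):
        num_timesteps = 0

    callback = EvalCallback(TimeFeatureWrapper(gym.make('Pendulum-v0')), eval_freq=10000)
    # With n_calls=1 the evaluation branch is skipped (1 % eval_freq != 0), so this only
    # exercises the bookkeeping and the return value.
    assert callback({'self': _DummyModel()}, {}) is True
    print('EvalCallback follows the callback(locals_, globals_) protocol.')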