From 9c969614bcb52e5856b854d4bd2b02951e0563a6 Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Thu, 2 Jan 2020 11:00:45 +0100
Subject: [PATCH 1/4] Add Stable-Baselines example with SAC and TD3

---
 .../stable_baselines/__init__.py             |   0
 .../pybullet_envs/stable_baselines/train.py  |  69 +++++++++++
 .../pybullet_envs/stable_baselines/utils.py  | 114 ++++++++++++++++++
 3 files changed, 183 insertions(+)
 create mode 100644 examples/pybullet/gym/pybullet_envs/stable_baselines/__init__.py
 create mode 100644 examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
 create mode 100644 examples/pybullet/gym/pybullet_envs/stable_baselines/utils.py

diff --git a/examples/pybullet/gym/pybullet_envs/stable_baselines/__init__.py b/examples/pybullet/gym/pybullet_envs/stable_baselines/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
new file mode 100644
index 000000000..c258a30c7
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
@@ -0,0 +1,69 @@
+# Code adapted from https://github.com/araffin/rl-baselines-zoo
+# it requires stable-baselines to be installed
+# Colab Notebook:
+# Author: Antonin RAFFIN
+# MIT License
+
+# Add parent dir to find package. Only needed for source code build, pip install doesn't need it.
+import os
+import inspect
+import argparse
+
+try:
+    import pybullet_envs
+except ImportError:
+    current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+    parent_dir = os.path.dirname(os.path.dirname(current_dir))
+    os.sys.path.insert(0, parent_dir)
+
+import gym
+import numpy as np
+from stable_baselines import SAC, TD3
+from stable_baselines.common.noise import NormalActionNoise
+
+from utils import TimeFeatureWrapper, EvalCallback
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--algo', help='RL Algorithm (Soft Actor-Critic by default)', default='sac',
+                        type=str, required=False, choices=['sac', 'td3'])
+    parser.add_argument('--env', type=str, default='HalfCheetahBulletEnv-v0', help='environment ID')
+    parser.add_argument('-n', '--n-timesteps', help='Number of training timesteps', default=int(1e6),
+                        type=int)
+    args = parser.parse_args()
+
+    env_id = args.env
+    n_timesteps = args.n_timesteps
+    save_path = '{}_{}'.format(args.algo, env_id)
+
+    # Instantiate and wrap the environment
+    env = TimeFeatureWrapper(gym.make(env_id))
+
+    # Create the evaluation environment and callback
+    eval_env = TimeFeatureWrapper(gym.make(env_id))
+    callback = EvalCallback(eval_env, best_model_save_path=save_path + '_best')
+
+    algo = {
+        'sac': SAC,
+        'td3': TD3
+    }[args.algo]
+
+    n_actions = env.action_space.shape[0]
+
+    # Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
+    hyperparams = {
+        'sac': dict(batch_size=256, gamma=0.98, policy_kwargs=dict(layers=[256, 256]),
+                    learning_starts=10000, buffer_size=int(1e6), tau=0.01),
+
+        'td3': dict(batch_size=100, policy_kwargs=dict(layers=[400, 300]),
+                    learning_rate=1e-3, learning_starts=10000, buffer_size=int(1e6),
+                    train_freq=1000, gradient_steps=1000,
+                    action_noise=NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)))
+    }[args.algo]
+
+    model = algo('MlpPolicy', env, verbose=1, **hyperparams)
+    model.learn(n_timesteps, callback=callback)
+
+    print("Saving to {}.zip".format(save_path))
+    model.save(save_path)
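Usage sketch (not part of the files added by this patch): once train.py has finished, the
saved agent can be reloaded and replayed. The snippet assumes the script's default
environment id, the save path it would produce ('sac_HalfCheetahBulletEnv-v0'), and that it
is run from the example directory so that utils.py is importable.

    import gym
    import pybullet_envs  # noqa: F401, the import registers the Bullet environments
    from stable_baselines import SAC

    from utils import TimeFeatureWrapper

    # Wrap the environment exactly as during training so the observation
    # still contains the extra time feature expected by the policy.
    env = TimeFeatureWrapper(gym.make('HalfCheetahBulletEnv-v0'))
    model = SAC.load('sac_HalfCheetahBulletEnv-v0')

    obs = env.reset()
    for _ in range(1000):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)
        if done:
            obs = env.reset()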
diff --git a/examples/pybullet/gym/pybullet_envs/stable_baselines/utils.py b/examples/pybullet/gym/pybullet_envs/stable_baselines/utils.py
new file mode 100644
index 000000000..1e2c054c8
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/stable_baselines/utils.py
@@ -0,0 +1,114 @@
+# Code adapted from https://github.com/araffin/rl-baselines-zoo
+# it requires stable-baselines to be installed
+# Author: Antonin RAFFIN
+# MIT License
+import gym
+import numpy as np
+from gym.wrappers import TimeLimit
+
+from stable_baselines.common.evaluation import evaluate_policy
+
+
+class TimeFeatureWrapper(gym.Wrapper):
+    """
+    Add remaining time to observation space for fixed length episodes.
+    See https://arxiv.org/abs/1712.00378 and https://github.com/aravindr93/mjrl/issues/13.
+
+    :param env: (gym.Env)
+    :param max_steps: (int) Max number of steps of an episode
+        if it is not wrapped in a TimeLimit object.
+    :param test_mode: (bool) In test mode, the time feature is constant (equal to 1.0).
+        This allows checking that the agent did not overfit to this feature
+        by learning a deterministic, pre-defined sequence of actions.
+    """
+    def __init__(self, env, max_steps=1000, test_mode=False):
+        assert isinstance(env.observation_space, gym.spaces.Box)
+        # Add a time feature to the observation
+        low, high = env.observation_space.low, env.observation_space.high
+        low, high = np.concatenate((low, [0])), np.concatenate((high, [1.]))
+        env.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)
+
+        super(TimeFeatureWrapper, self).__init__(env)
+
+        if isinstance(env, TimeLimit):
+            self._max_steps = env._max_episode_steps
+        else:
+            self._max_steps = max_steps
+        self._current_step = 0
+        self._test_mode = test_mode
+
+    def reset(self):
+        self._current_step = 0
+        return self._get_obs(self.env.reset())
+
+    def step(self, action):
+        self._current_step += 1
+        obs, reward, done, info = self.env.step(action)
+        return self._get_obs(obs), reward, done, info
+
+    def _get_obs(self, obs):
+        """
+        Concatenate the time feature to the current observation.
+
+        :param obs: (np.ndarray)
+        :return: (np.ndarray)
+        """
+        # Remaining time is more general
+        time_feature = 1 - (self._current_step / self._max_steps)
+        if self._test_mode:
+            time_feature = 1.0
+        # Optionally: concatenate [time_feature, time_feature ** 2]
+        return np.concatenate((obs, [time_feature]))
+
+
+class EvalCallback(object):
+    """
+    Callback for evaluating an agent.
+
+    :param eval_env: (gym.Env) The environment used for evaluation
+    :param n_eval_episodes: (int) The number of episodes to test the agent
+    :param eval_freq: (int) Evaluate the agent every eval_freq calls of the callback.
+    :param deterministic: (bool) Whether to use deterministic actions for evaluation
+    :param best_model_save_path: (str) Path to save the best model (no saving if None)
+    :param verbose: (int) Verbosity level
+    """
+    def __init__(self, eval_env, n_eval_episodes=5, eval_freq=10000,
+                 deterministic=True, best_model_save_path=None, verbose=1):
+        super(EvalCallback, self).__init__()
+        self.n_eval_episodes = n_eval_episodes
+        self.eval_freq = eval_freq
+        self.best_mean_reward = -np.inf
+        self.deterministic = deterministic
+        self.eval_env = eval_env
+        self.verbose = verbose
+        self.model, self.num_timesteps = None, 0
+        self.best_model_save_path = best_model_save_path
+        self.n_calls = 0
+
+    def __call__(self, locals_, globals_):
+        """
+        :param locals_: (dict)
+        :param globals_: (dict)
+        :return: (bool)
+        """
+        self.n_calls += 1
+        self.model = locals_['self']
+        self.num_timesteps = self.model.num_timesteps
+
+        if self.n_calls % self.eval_freq == 0:
+            episode_rewards, _ = evaluate_policy(self.model, self.eval_env, n_eval_episodes=self.n_eval_episodes,
+                                                 deterministic=self.deterministic, return_episode_rewards=True)
+
+            mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
+            if self.verbose > 0:
+                print("Eval num_timesteps={}, "
+                      "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps, mean_reward, std_reward))
+
+            if mean_reward > self.best_mean_reward:
+                if self.best_model_save_path is not None:
+                    print("Saving best model")
+                    self.model.save(self.best_model_save_path)
+                self.best_mean_reward = mean_reward
+
+        return True
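To make the TimeFeatureWrapper concrete: the wrapped observation gains one extra component
holding the remaining episode time, starting at 1.0 and decreasing linearly. A small check
sketch, assuming the usual 1000-step time limit of the Bullet locomotion environments:

    import gym
    import pybullet_envs  # noqa: F401, registers the Bullet environments

    from utils import TimeFeatureWrapper

    env = TimeFeatureWrapper(gym.make('HalfCheetahBulletEnv-v0'))
    print(env.observation_space.shape)  # original observation size + 1 (the time feature)

    obs = env.reset()
    print(obs[-1])  # 1.0: the whole episode is still ahead
    obs, reward, done, info = env.step(env.action_space.sample())
    print(obs[-1])  # 0.999: 1 - 1/1000 after a single step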
From da0483b03a34ac6402d13be454365201de368c5e Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Thu, 2 Jan 2020 11:06:39 +0100
Subject: [PATCH 2/4] Add colab notebook

---
 examples/pybullet/gym/pybullet_envs/stable_baselines/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
index c258a30c7..cc8055d73 100644
--- a/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
+++ b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
@@ -1,6 +1,6 @@
 # Code adapted from https://github.com/araffin/rl-baselines-zoo
 # it requires stable-baselines to be installed
-# Colab Notebook:
+# Colab Notebook: https://colab.research.google.com/drive/1nZkHO4QTYfAksm9ZTaZ5vXyC7szZxC3F
 # Author: Antonin RAFFIN
 # MIT License
 
From 59c61a46bb51aded4db9800a9bcbe60bca59cac7 Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Thu, 2 Jan 2020 11:12:45 +0100
Subject: [PATCH 3/4] Simplify imports

---
 .../gym/pybullet_envs/stable_baselines/train.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
index cc8055d73..9a47dc463 100644
--- a/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
+++ b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
@@ -3,18 +3,9 @@
 # Colab Notebook: https://colab.research.google.com/drive/1nZkHO4QTYfAksm9ZTaZ5vXyC7szZxC3F
 # Author: Antonin RAFFIN
 # MIT License
-
-# Add parent dir to find package. Only needed for source code build, pip install doesn't need it.
-import os
-import inspect
 import argparse
 
-try:
-    import pybullet_envs
-except ImportError:
-    current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
-    parent_dir = os.path.dirname(os.path.dirname(current_dir))
-    os.sys.path.insert(0, parent_dir)
+import pybullet_envs
 
 import gym
 import numpy as np
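The simplification above assumes pybullet is installed via pip, in which case a plain
'import pybullet_envs' is all that is needed to register the Bullet environments with gym;
the removed try/except was only a fallback for source-tree builds. A minimal sketch of what
the import provides:

    import gym
    import pybullet_envs  # the import itself registers ids such as HalfCheetahBulletEnv-v0

    # Once registered, Bullet environments resolve like any other gym id
    env = gym.make('HalfCheetahBulletEnv-v0')
    print(env.observation_space, env.action_space)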
From adad4dc402a4b361cde269ede393acb31464c9a8 Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Thu, 2 Jan 2020 14:47:30 +0100
Subject: [PATCH 4/4] Update buffer size

---
 examples/pybullet/gym/pybullet_envs/stable_baselines/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
index 9a47dc463..4390b7bb9 100644
--- a/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
+++ b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
@@ -45,7 +45,7 @@ if __name__ == '__main__':
     # Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
     hyperparams = {
         'sac': dict(batch_size=256, gamma=0.98, policy_kwargs=dict(layers=[256, 256]),
-                    learning_starts=10000, buffer_size=int(1e6), tau=0.01),
+                    learning_starts=10000, buffer_size=int(2e5), tau=0.01),
 
         'td3': dict(batch_size=100, policy_kwargs=dict(layers=[400, 300]),
                     learning_rate=1e-3, learning_starts=10000, buffer_size=int(1e6),