diff --git a/examples/pybullet/gym/pybullet_envs/stable_baselines/__init__.py b/examples/pybullet/gym/pybullet_envs/stable_baselines/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
new file mode 100644
index 000000000..c258a30c7
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/stable_baselines/train.py
@@ -0,0 +1,69 @@
+# Code adapted from https://github.com/araffin/rl-baselines-zoo
+# It requires stable-baselines to be installed
+# Colab Notebook:
+# Author: Antonin RAFFIN
+# MIT License
+
+# Add parent dir to find the package. Only needed for a source build; a pip install doesn't need it.
+import os
+import inspect
+import argparse
+
+try:
+    import pybullet_envs
+except ImportError:
+    current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+    parent_dir = os.path.dirname(os.path.dirname(current_dir))
+    os.sys.path.insert(0, parent_dir)
+
+import gym
+import numpy as np
+from stable_baselines import SAC, TD3
+from stable_baselines.common.noise import NormalActionNoise
+
+from utils import TimeFeatureWrapper, EvalCallback
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--algo', help='RL Algorithm (Soft Actor-Critic by default)', default='sac',
+                        type=str, required=False, choices=['sac', 'td3'])
+    parser.add_argument('--env', type=str, default='HalfCheetahBulletEnv-v0', help='environment ID')
+    parser.add_argument('-n', '--n-timesteps', help='Number of training timesteps', default=int(1e6),
+                        type=int)
+    args = parser.parse_args()
+
+    env_id = args.env
+    n_timesteps = args.n_timesteps
+    save_path = '{}_{}'.format(args.algo, env_id)
+
+    # Instantiate and wrap the environment
+    env = TimeFeatureWrapper(gym.make(env_id))
+
+    # Create the evaluation environment and callback
+    eval_env = TimeFeatureWrapper(gym.make(env_id))
+    callback = EvalCallback(eval_env, best_model_save_path=save_path + '_best')
+
+    algo = {
+        'sac': SAC,
+        'td3': TD3
+    }[args.algo]
+
+    n_actions = env.action_space.shape[0]
+
+    # Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
+    hyperparams = {
+        'sac': dict(batch_size=256, gamma=0.98, policy_kwargs=dict(layers=[256, 256]),
+                    learning_starts=10000, buffer_size=int(1e6), tau=0.01),
+
+        'td3': dict(batch_size=100, policy_kwargs=dict(layers=[400, 300]),
+                    learning_rate=1e-3, learning_starts=10000, buffer_size=int(1e6),
+                    train_freq=1000, gradient_steps=1000,
+                    action_noise=NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)))
+    }[args.algo]
+
+    model = algo('MlpPolicy', env, verbose=1, **hyperparams)
+    model.learn(n_timesteps, callback=callback)
+
+    print("Saving to {}.zip".format(save_path))
+    model.save(save_path)
diff --git a/examples/pybullet/gym/pybullet_envs/stable_baselines/utils.py b/examples/pybullet/gym/pybullet_envs/stable_baselines/utils.py
new file mode 100644
index 000000000..1e2c054c8
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/stable_baselines/utils.py
@@ -0,0 +1,114 @@
+# Code adapted from https://github.com/araffin/rl-baselines-zoo
+# It requires stable-baselines to be installed
+# Author: Antonin RAFFIN
+# MIT License
+import gym
+import numpy as np
+from gym.wrappers import TimeLimit
+
+from stable_baselines.common.evaluation import evaluate_policy
+
+
+class TimeFeatureWrapper(gym.Wrapper):
+    """
+    Add the remaining time to the observation space for fixed-length episodes.
+    See https://arxiv.org/abs/1712.00378 and https://github.com/aravindr93/mjrl/issues/13.
+
+    :param env: (gym.Env) Environment to wrap.
+    :param max_steps: (int) Max number of steps of an episode
+        if the env is not wrapped in a TimeLimit object.
+    :param test_mode: (bool) In test mode, the time feature is constant,
+        equal to 1.0. This allows checking that the agent has not overfitted to this
+        feature by learning a deterministic pre-defined sequence of actions.
+    """
+    def __init__(self, env, max_steps=1000, test_mode=False):
+        assert isinstance(env.observation_space, gym.spaces.Box)
+        # Add a time feature to the observation
+        low, high = env.observation_space.low, env.observation_space.high
+        low, high = np.concatenate((low, [0.])), np.concatenate((high, [1.]))
+        env.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)
+
+        super(TimeFeatureWrapper, self).__init__(env)
+
+        if isinstance(env, TimeLimit):
+            self._max_steps = env._max_episode_steps
+        else:
+            self._max_steps = max_steps
+        self._current_step = 0
+        self._test_mode = test_mode
+
+    def reset(self):
+        self._current_step = 0
+        return self._get_obs(self.env.reset())
+
+    def step(self, action):
+        self._current_step += 1
+        obs, reward, done, info = self.env.step(action)
+        return self._get_obs(obs), reward, done, info
+
+    def _get_obs(self, obs):
+        """
+        Concatenate the time feature to the current observation.
+
+        :param obs: (np.ndarray)
+        :return: (np.ndarray)
+        """
+        # Remaining time (rather than the elapsed step count) is more general
+        time_feature = 1 - (self._current_step / self._max_steps)
+        if self._test_mode:
+            time_feature = 1.0
+        # Optionally: concatenate [time_feature, time_feature ** 2]
+        return np.concatenate((obs, [time_feature]))
+
+
+class EvalCallback(object):
+    """
+    Callback for evaluating an agent.
+
+    :param eval_env: (gym.Env) The environment used for evaluation
+    :param n_eval_episodes: (int) The number of episodes to test the agent
+    :param eval_freq: (int) Evaluate the agent every eval_freq calls of the callback.
+    :param deterministic: (bool) Whether to use deterministic actions for evaluation
+    :param best_model_save_path: (str) Path where the best model is saved (None to disable saving)
+    :param verbose: (int) Verbosity level (0: silent, 1: print evaluation results)
+    """
+    def __init__(self, eval_env, n_eval_episodes=5, eval_freq=10000,
+                 deterministic=True, best_model_save_path=None, verbose=1):
+        super(EvalCallback, self).__init__()
+        self.n_eval_episodes = n_eval_episodes
+        self.eval_freq = eval_freq
+        self.best_mean_reward = -np.inf
+        self.deterministic = deterministic
+        self.eval_env = eval_env
+        self.verbose = verbose
+        self.model, self.num_timesteps = None, 0
+        self.best_model_save_path = best_model_save_path
+        self.n_calls = 0
+
+    def __call__(self, locals_, globals_):
+        """
+        :param locals_: (dict)
+        :param globals_: (dict)
+        :return: (bool) Whether training should continue.
+        """
+        self.n_calls += 1
+        self.model = locals_['self']
+        self.num_timesteps = self.model.num_timesteps
+
+        if self.n_calls % self.eval_freq == 0:
+            episode_rewards, _ = evaluate_policy(self.model, self.eval_env, n_eval_episodes=self.n_eval_episodes,
+                                                 deterministic=self.deterministic, return_episode_rewards=True)
+
+
+            mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
+            if self.verbose > 0:
+                print("Eval num_timesteps={}, "
+                      "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps, mean_reward, std_reward))
+
+            if mean_reward > self.best_mean_reward:
+                if self.best_model_save_path is not None:
+                    print("Saving best model")
+                    self.model.save(self.best_model_save_path)
+                self.best_mean_reward = mean_reward
+
+        return True
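
Not part of the patch above, for reference only: train.py is launched from this directory, for example "python train.py --algo sac --env HalfCheetahBulletEnv-v0 -n 1000000", and writes the final model to sac_HalfCheetahBulletEnv-v0.zip plus the best evaluated model to sac_HalfCheetahBulletEnv-v0_best.zip. Below is a minimal sketch of how such a saved model could be loaded and replayed, assuming the stable-baselines 2.x SAC.load / model.predict API and the default environment and paths used by train.py; the enjoy.py file name, the step budget and the render hint are illustrative and not part of this PR.

# enjoy.py -- hypothetical companion script, not included in this patch
import gym
import pybullet_envs  # noqa: F401, importing registers the *BulletEnv-v0 environments
from stable_baselines import SAC

from utils import TimeFeatureWrapper

env_id = 'HalfCheetahBulletEnv-v0'
# The observation must carry the same time feature the model was trained with
env = TimeFeatureWrapper(gym.make(env_id))
# Bullet envs typically need env.render(mode='human') before the first reset() to open a GUI
model = SAC.load('sac_' + env_id + '_best')  # best model written by EvalCallback in train.py

obs = env.reset()
episode_reward = 0.0
for _ in range(3000):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action)
    episode_reward += reward
    if done:
        print("Episode reward: {:.2f}".format(episode_reward))
        episode_reward = 0.0
        obs = env.reset()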