From 8b53e47fe8531710eb1ae0533035ef4e51f855d4 Mon Sep 17 00:00:00 2001
From: Erwin Coumans
Date: Fri, 2 Nov 2018 11:19:46 -0700
Subject: [PATCH] add simpler ARS implementation, thanks to Alexis Jacq and
 Hadelin de Ponteves (will add save/restore of policy and rendering movies
 through command-line arguments soon)

---
 .../pybullet/gym/pybullet_envs/ARS/ars.py | 142 ++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 examples/pybullet/gym/pybullet_envs/ARS/ars.py

diff --git a/examples/pybullet/gym/pybullet_envs/ARS/ars.py b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
new file mode 100644
index 000000000..b4c0c3127
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
@@ -0,0 +1,142 @@
+# AI 2018
+
+# Importing the libraries
+import os
+import numpy as np
+import gym
+from gym import wrappers
+import pybullet_envs
+
+# Setting the Hyper Parameters
+
+class Hp():
+
+    def __init__(self):
+        self.nb_steps = 1000
+        self.episode_length = 1000
+        self.learning_rate = 0.02
+        self.nb_directions = 16
+        self.nb_best_directions = 16
+        assert self.nb_best_directions <= self.nb_directions
+        self.noise = 0.03
+        self.seed = 1
+        self.env_name = 'HalfCheetahBulletEnv-v0'
+
+# Normalizing the states
+
+class Normalizer():
+
+    def __init__(self, nb_inputs):
+        self.n = np.zeros(nb_inputs)
+        self.mean = np.zeros(nb_inputs)
+        self.mean_diff = np.zeros(nb_inputs)
+        self.var = np.zeros(nb_inputs)
+
+    def observe(self, x):
+        self.n += 1.
+        last_mean = self.mean.copy()
+        self.mean += (x - self.mean) / self.n
+        self.mean_diff += (x - last_mean) * (x - self.mean)
+        self.var = (self.mean_diff / self.n).clip(min = 1e-2)
+
+    def normalize(self, inputs):
+        obs_mean = self.mean
+        obs_std = np.sqrt(self.var)
+        return (inputs - obs_mean) / obs_std
+
+# Building the AI
+
+class Policy():
+
+    def __init__(self, input_size, output_size):
+        self.theta = np.zeros((output_size, input_size))
+        print("self.theta=",self.theta)
+    def evaluate(self, input, delta = None, direction = None):
+        if direction is None:
+            return np.clip(self.theta.dot(input), -1.0, 1.0)
+        elif direction == "positive":
+            return np.clip((self.theta + hp.noise*delta).dot(input), -1.0, 1.0)
+        else:
+            return np.clip((self.theta - hp.noise*delta).dot(input), -1.0, 1.0)
+
+    def sample_deltas(self):
+        return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]
+
+    def update(self, rollouts, sigma_r):
+        step = np.zeros(self.theta.shape)
+        for r_pos, r_neg, d in rollouts:
+            step += (r_pos - r_neg) * d
+        self.theta += hp.learning_rate / (hp.nb_best_directions * sigma_r) * step
+
+# Exploring the policy on one specific direction and over one episode
+
+def explore(env, normalizer, policy, direction = None, delta = None):
+    state = env.reset()
+    done = False
+    num_plays = 0.
+    sum_rewards = 0
+    while not done and num_plays < hp.episode_length:
+        normalizer.observe(state)
+        state = normalizer.normalize(state)
+        action = policy.evaluate(state, delta, direction)
+        state, reward, done, _ = env.step(action)
+        reward = max(min(reward, 1), -1)
+        sum_rewards += reward
+        num_plays += 1
+    return sum_rewards
+
+# Training the AI
+
+def train(env, policy, normalizer, hp):
+
+    for step in range(hp.nb_steps):
+
+        # Initializing the perturbations deltas and the positive/negative rewards
+        deltas = policy.sample_deltas()
+        positive_rewards = [0] * hp.nb_directions
+        negative_rewards = [0] * hp.nb_directions
+
+        # Getting the positive rewards in the positive directions
+        for k in range(hp.nb_directions):
+            positive_rewards[k] = explore(env, normalizer, policy, direction = "positive", delta = deltas[k])
+
+        # Getting the negative rewards in the negative/opposite directions
+        for k in range(hp.nb_directions):
+            negative_rewards[k] = explore(env, normalizer, policy, direction = "negative", delta = deltas[k])
+
+        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
+        all_rewards = np.array(positive_rewards + negative_rewards)
+        sigma_r = all_rewards.std()
+
+        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
+        scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
+        order = sorted(scores.keys(), key = lambda x:scores[x], reverse = True)[:hp.nb_best_directions]
+        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
+
+        # Updating our policy
+        policy.update(rollouts, sigma_r)
+
+        # Printing the final reward of the policy after the update
+        reward_evaluation = explore(env, normalizer, policy)
+        print('Step:', step, 'Reward:', reward_evaluation)
+
+# Running the main code
+
+def mkdir(base, name):
+    path = os.path.join(base, name)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    return path
+work_dir = mkdir('exp', 'brs')
+monitor_dir = mkdir(work_dir, 'monitor')
+
+hp = Hp()
+np.random.seed(hp.seed)
+env = gym.make(hp.env_name)
+# env.render(mode = "human")
+#env = wrappers.Monitor(env, monitor_dir, force = True)
+nb_inputs = env.observation_space.shape[0]
+nb_outputs = env.action_space.shape[0]
+policy = Policy(nb_inputs, nb_outputs)
+normalizer = Normalizer(nb_inputs)
+train(env, policy, normalizer, hp)
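
Note: once pybullet and gym are installed, the script can be run directly with python ars.py from examples/pybullet/gym/pybullet_envs/ARS; it trains the linear policy on HalfCheetahBulletEnv-v0 with the hyperparameters hard-coded in Hp and prints the evaluation reward after every update step.

The commit message promises save/restore of the policy and movie rendering through command-line arguments in a later change. That functionality is not part of this patch; the sketch below is only one possible way it might be wired up, replacing the module-level setup at the end of ars.py. The flag names (--load_policy, --save_policy, --render) and the .npy weight file are assumptions for illustration, not the eventual interface.

# Sketch only: hypothetical command-line options for ars.py, not part of this patch.
# Assumes it replaces the block starting at "hp = Hp()", so np, gym, Hp, Policy,
# Normalizer and train from ars.py are already in scope.
import argparse

parser = argparse.ArgumentParser(description='Augmented Random Search on a PyBullet gym environment')
parser.add_argument('--load_policy', default=None, help='path of saved policy weights (.npy) to restore before training')
parser.add_argument('--save_policy', default='ars_policy.npy', help='where to write the trained policy weights')
parser.add_argument('--render', action='store_true', help='open the PyBullet GUI while training')
args = parser.parse_args()

hp = Hp()
np.random.seed(hp.seed)
env = gym.make(hp.env_name)
if args.render:
    env.render(mode='human')  # same call that is left commented out in the patch
nb_inputs = env.observation_space.shape[0]
nb_outputs = env.action_space.shape[0]
policy = Policy(nb_inputs, nb_outputs)
normalizer = Normalizer(nb_inputs)
if args.load_policy is not None:
    policy.theta = np.load(args.load_policy)  # restore previously trained weights
train(env, policy, normalizer, hp)
np.save(args.save_policy, policy.theta)  # save the trained weights for a later run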