From 8b53e47fe8531710eb1ae0533035ef4e51f855d4 Mon Sep 17 00:00:00 2001
From: Erwin Coumans
Date: Fri, 2 Nov 2018 11:19:46 -0700
Subject: [PATCH] add simpler ARS implementation, thanks to Alexis Jacq and
 Hadelin de Ponteves (will add save/restore of policy and rendering movies
 through command-line arguments soon)

---
 .../pybullet/gym/pybullet_envs/ARS/ars.py | 142 ++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 examples/pybullet/gym/pybullet_envs/ARS/ars.py

diff --git a/examples/pybullet/gym/pybullet_envs/ARS/ars.py b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
new file mode 100644
index 000000000..b4c0c3127
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
@@ -0,0 +1,142 @@
+# AI 2018
+
+# Importing the libraries
+import os
+import numpy as np
+import gym
+from gym import wrappers
+import pybullet_envs
+
+# Setting the Hyper Parameters
+
+class Hp():
+
+    def __init__(self):
+        self.nb_steps = 1000
+        self.episode_length = 1000
+        self.learning_rate = 0.02
+        self.nb_directions = 16
+        self.nb_best_directions = 16
+        assert self.nb_best_directions <= self.nb_directions
+        self.noise = 0.03
+        self.seed = 1
+        self.env_name = 'HalfCheetahBulletEnv-v0'
+
+# Normalizing the states
+
+class Normalizer():
+
+    def __init__(self, nb_inputs):
+        self.n = np.zeros(nb_inputs)
+        self.mean = np.zeros(nb_inputs)
+        self.mean_diff = np.zeros(nb_inputs)
+        self.var = np.zeros(nb_inputs)
+
+    def observe(self, x):
+        self.n += 1.
+        last_mean = self.mean.copy()
+        self.mean += (x - self.mean) / self.n
+        self.mean_diff += (x - last_mean) * (x - self.mean)
+        self.var = (self.mean_diff / self.n).clip(min = 1e-2)
+
+    def normalize(self, inputs):
+        obs_mean = self.mean
+        obs_std = np.sqrt(self.var)
+        return (inputs - obs_mean) / obs_std
+
+# Building the AI
+
+class Policy():
+
+    def __init__(self, input_size, output_size):
+        self.theta = np.zeros((output_size, input_size))
+        print("self.theta=",self.theta)
+    def evaluate(self, input, delta = None, direction = None):
+        if direction is None:
+            return np.clip(self.theta.dot(input), -1.0, 1.0)
+        elif direction == "positive":
+            return np.clip((self.theta + hp.noise*delta).dot(input), -1.0, 1.0)
+        else:
+            return np.clip((self.theta - hp.noise*delta).dot(input), -1.0, 1.0)
+
+    def sample_deltas(self):
+        return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]
+
+    def update(self, rollouts, sigma_r):
+        step = np.zeros(self.theta.shape)
+        for r_pos, r_neg, d in rollouts:
+            step += (r_pos - r_neg) * d
+        self.theta += hp.learning_rate / (hp.nb_best_directions * sigma_r) * step
+
+# Exploring the policy on one specific direction and over one episode
+
+def explore(env, normalizer, policy, direction = None, delta = None):
+    state = env.reset()
+    done = False
+    num_plays = 0.
+    sum_rewards = 0
+    while not done and num_plays < hp.episode_length:
+        normalizer.observe(state)
+        state = normalizer.normalize(state)
+        action = policy.evaluate(state, delta, direction)
+        state, reward, done, _ = env.step(action)
+        reward = max(min(reward, 1), -1)
+        sum_rewards += reward
+        num_plays += 1
+    return sum_rewards
+
+# Training the AI
+
+def train(env, policy, normalizer, hp):
+
+    for step in range(hp.nb_steps):
+
+        # Initializing the perturbations deltas and the positive/negative rewards
+        deltas = policy.sample_deltas()
+        positive_rewards = [0] * hp.nb_directions
+        negative_rewards = [0] * hp.nb_directions
+
+        # Getting the positive rewards in the positive directions
+        for k in range(hp.nb_directions):
+            positive_rewards[k] = explore(env, normalizer, policy, direction = "positive", delta = deltas[k])
+
+        # Getting the negative rewards in the negative/opposite directions
+        for k in range(hp.nb_directions):
+            negative_rewards[k] = explore(env, normalizer, policy, direction = "negative", delta = deltas[k])
+
+        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
+        all_rewards = np.array(positive_rewards + negative_rewards)
+        sigma_r = all_rewards.std()
+
+        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
+        scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
+        order = sorted(scores.keys(), key = lambda x:scores[x], reverse = True)[:hp.nb_best_directions]
+        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
+
+        # Updating our policy
+        policy.update(rollouts, sigma_r)
+
+        # Printing the final reward of the policy after the update
+        reward_evaluation = explore(env, normalizer, policy)
+        print('Step:', step, 'Reward:', reward_evaluation)
+
+# Running the main code
+
+def mkdir(base, name):
+    path = os.path.join(base, name)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    return path
+work_dir = mkdir('exp', 'brs')
+monitor_dir = mkdir(work_dir, 'monitor')
+
+hp = Hp()
+np.random.seed(hp.seed)
+env = gym.make(hp.env_name)
+# env.render(mode = "human")
+#env = wrappers.Monitor(env, monitor_dir, force = True)
+nb_inputs = env.observation_space.shape[0]
+nb_outputs = env.action_space.shape[0]
+policy = Policy(nb_inputs, nb_outputs)
+normalizer = Normalizer(nb_inputs)
+train(env, policy, normalizer, hp)
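
Note: once pybullet and gym are installed, the script can be run directly with python ars.py from examples/pybullet/gym/pybullet_envs/ARS; it trains the linear policy on HalfCheetahBulletEnv-v0 with the hyperparameters hard-coded in Hp and prints the evaluation reward after every update step.

The commit message promises save/restore of the policy and movie rendering through command-line arguments in a later change. That functionality is not part of this patch; the sketch below is only one possible way it might be wired up, replacing the module-level setup at the end of ars.py. The flag names (--load_policy, --save_policy, --render) and the .npy weight file are assumptions for illustration, not the eventual interface.

# Sketch only: hypothetical command-line options for ars.py, not part of this patch.
# Assumes it replaces the block starting at "hp = Hp()", so np, gym, Hp, Policy,
# Normalizer and train from ars.py are already in scope.
import argparse

parser = argparse.ArgumentParser(description='Augmented Random Search on a PyBullet gym environment')
parser.add_argument('--load_policy', default=None, help='path of saved policy weights (.npy) to restore before training')
parser.add_argument('--save_policy', default='ars_policy.npy', help='where to write the trained policy weights')
parser.add_argument('--render', action='store_true', help='open the PyBullet GUI while training')
args = parser.parse_args()

hp = Hp()
np.random.seed(hp.seed)
env = gym.make(hp.env_name)
if args.render:
    env.render(mode='human')  # same call that is left commented out in the patch
nb_inputs = env.observation_space.shape[0]
nb_outputs = env.action_space.shape[0]
policy = Policy(nb_inputs, nb_outputs)
normalizer = Normalizer(nb_inputs)
if args.load_policy is not None:
    policy.theta = np.load(args.load_policy)  # restore previously trained weights
train(env, policy, normalizer, hp)
np.save(args.save_policy, policy.theta)  # save the trained weights for a later run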