Merge remote-tracking branch 'bp/master'

Author: Erwin Coumans
Date:   2019-04-27 14:19:58 -07:00
8 changed files with 341 additions and 190 deletions


@@ -20,17 +20,17 @@ import argparse
 # Setting the Hyper Parameters
 class Hp():
-  def __init__(self):
-    self.nb_steps = 10000
-    self.episode_length = 1000
-    self.learning_rate = 0.02
-    self.nb_directions = 16
-    self.nb_best_directions = 16
-    assert self.nb_best_directions <= self.nb_directions
-    self.noise = 0.03
-    self.seed = 1
-    self.env_name = 'HalfCheetahBulletEnv-v0'
+  def __init__(self):
+    self.nb_steps = 10000
+    self.episode_length = 1000
+    self.learning_rate = 0.02
+    self.nb_directions = 16
+    self.nb_best_directions = 8
+    assert self.nb_best_directions <= self.nb_directions
+    self.noise = 0.03
+    self.seed = 1
+    self.env_name = 'HalfCheetahBulletEnv-v0'
 # Multiprocess Exploring the policy on one specific direction and over one episode
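The only substantive change in this hunk is hp.nb_best_directions dropping from 16 to 8, so only the top half of the 16 sampled directions now contributes to each update. For context, a minimal sketch of the update rule that policy.update presumably applies, assuming it follows the standard Augmented Random Search update of Mania et al.; the function name, the theta argument and the shapes are illustrative, not taken from this file:

import numpy as np

def ars_update_sketch(theta, rollouts, sigma_r, learning_rate):
  # Illustrative ARS update, not the repo's exact code.
  # rollouts: (r_pos, r_neg, delta) tuples for the nb_best_directions kept directions.
  # sigma_r: standard deviation of all 2 * nb_directions rewards from this step.
  step = np.zeros_like(theta)
  for r_pos, r_neg, delta in rollouts:
    step += (r_pos - r_neg) * delta
  # Average over the kept directions and scale by the reward standard deviation.
  return theta + learning_rate / (len(rollouts) * sigma_r) * step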
@@ -165,55 +165,52 @@ def explore(env, normalizer, policy, direction, delta, hp):
 def train(env, policy, normalizer, hp, parentPipes, args):
-  for step in range(hp.nb_steps):
-    # Initializing the perturbations deltas and the positive/negative rewards
-    deltas = policy.sample_deltas()
-    positive_rewards = [0] * hp.nb_directions
-    negative_rewards = [0] * hp.nb_directions
-    if parentPipes:
-      for k in range(hp.nb_directions):
-        parentPipe = parentPipes[k]
-        parentPipe.send([_EXPLORE, [normalizer, policy, hp, "positive", deltas[k]]])
-      for k in range(hp.nb_directions):
-        positive_rewards[k] = parentPipes[k].recv()[0]
-      for k in range(hp.nb_directions):
-        parentPipe = parentPipes[k]
-        parentPipe.send([_EXPLORE, [normalizer, policy, hp, "negative", deltas[k]]])
-      for k in range(hp.nb_directions):
-        negative_rewards[k] = parentPipes[k].recv()[0]
-    else:
-      # Getting the positive rewards in the positive directions
-      for k in range(hp.nb_directions):
-        positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)
-      # Getting the negative rewards in the negative/opposite directions
-      for k in range(hp.nb_directions):
-        negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)
-    # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
-    all_rewards = np.array(positive_rewards + negative_rewards)
-    sigma_r = all_rewards.std()
-    # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
-    scores = {
-        k: max(r_pos, r_neg)
-        for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))
-    }
-    order = sorted(scores.keys(), key=lambda x: scores[x])[:hp.nb_best_directions]
-    rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
-    # Updating our policy
-    policy.update(rollouts, sigma_r, args)
-    # Printing the final reward of the policy after the update
-    reward_evaluation = explore(env, normalizer, policy, None, None, hp)
-    print('Step:', step, 'Reward:', reward_evaluation)
+  for step in range(hp.nb_steps):
+    # Initializing the perturbations deltas and the positive/negative rewards
+    deltas = policy.sample_deltas()
+    positive_rewards = [0] * hp.nb_directions
+    negative_rewards = [0] * hp.nb_directions
+    if parentPipes:
+      for k in range(hp.nb_directions):
+        parentPipe = parentPipes[k]
+        parentPipe.send([_EXPLORE,[normalizer, policy, hp, "positive", deltas[k]]])
+      for k in range(hp.nb_directions):
+        positive_rewards[k] = parentPipes[k].recv()[0]
+      for k in range(hp.nb_directions):
+        parentPipe = parentPipes[k]
+        parentPipe.send([_EXPLORE,[normalizer, policy, hp, "negative", deltas[k]]])
+      for k in range(hp.nb_directions):
+        negative_rewards[k] = parentPipes[k].recv()[0]
+    else:
+      # Getting the positive rewards in the positive directions
+      for k in range(hp.nb_directions):
+        positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)
+      # Getting the negative rewards in the negative/opposite directions
+      for k in range(hp.nb_directions):
+        negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)
+    # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
+    all_rewards = np.array(positive_rewards + negative_rewards)
+    sigma_r = all_rewards.std()
+    # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
+    scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
+    order = sorted(scores.keys(), key = lambda x:-scores[x])[:hp.nb_best_directions]
+    rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
+    # Updating our policy
+    policy.update(rollouts, sigma_r, args)
+    # Printing the final reward of the policy after the update
+    reward_evaluation = explore(env, normalizer, policy, None, None, hp)
+    print('Step:', step, 'Reward:', reward_evaluation)
 # Running the main code
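The behavioural fix in this hunk is easy to miss among the formatting changes: the sort key flips from scores[x] to -scores[x], so the hp.nb_best_directions rollouts passed to policy.update are now the highest-scoring directions rather than the lowest. A small self-contained sketch with made-up scores shows the difference:

# Hypothetical per-direction scores; only the sort key mirrors the diff above.
scores = {0: 1.5, 1: -0.2, 2: 3.0, 3: 0.7}
nb_best_directions = 2

old_order = sorted(scores.keys(), key=lambda x: scores[x])[:nb_best_directions]
new_order = sorted(scores.keys(), key=lambda x: -scores[x])[:nb_best_directions]

print(old_order)  # [1, 3]: the lowest-scoring directions (previous behaviour)
print(new_order)  # [2, 0]: the highest-scoring directions, as ARS intends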