diff --git a/examples/pybullet/gym/pybullet_envs/ARS/ars.py b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
index 52ae67b25..51bc776db 100644
--- a/examples/pybullet/gym/pybullet_envs/ARS/ars.py
+++ b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
@@ -20,17 +20,17 @@ import argparse
 
 # Setting the Hyper Parameters
 
 class Hp():
-
-    def __init__(self):
-        self.nb_steps = 10000
-        self.episode_length = 1000
-        self.learning_rate = 0.02
-        self.nb_directions = 16
-        self.nb_best_directions = 8
-        assert self.nb_best_directions <= self.nb_directions
-        self.noise = 0.03
-        self.seed = 1
-        self.env_name = 'HalfCheetahBulletEnv-v0'
+
+  def __init__(self):
+    self.nb_steps = 10000
+    self.episode_length = 1000
+    self.learning_rate = 0.02
+    self.nb_directions = 16
+    self.nb_best_directions = 8
+    assert self.nb_best_directions <= self.nb_directions
+    self.noise = 0.03
+    self.seed = 1
+    self.env_name = 'HalfCheetahBulletEnv-v0'
 
 # Multiprocess Exploring the policy on one specific direction and over one episode
@@ -165,52 +165,54 @@ def explore(env, normalizer, policy, direction, delta, hp):
 
 
 def train(env, policy, normalizer, hp, parentPipes, args):
-    for step in range(hp.nb_steps):
-
-        # Initializing the perturbations deltas and the positive/negative rewards
-        deltas = policy.sample_deltas()
-        positive_rewards = [0] * hp.nb_directions
-        negative_rewards = [0] * hp.nb_directions
-
-        if parentPipes:
-            for k in range(hp.nb_directions):
-                parentPipe = parentPipes[k]
-                parentPipe.send([_EXPLORE,[normalizer, policy, hp, "positive", deltas[k]]])
-            for k in range(hp.nb_directions):
-                positive_rewards[k] = parentPipes[k].recv()[0]
-
-            for k in range(hp.nb_directions):
-                parentPipe = parentPipes[k]
-                parentPipe.send([_EXPLORE,[normalizer, policy, hp, "negative", deltas[k]]])
-            for k in range(hp.nb_directions):
-                negative_rewards[k] = parentPipes[k].recv()[0]
-
-        else:
-            # Getting the positive rewards in the positive directions
-            for k in range(hp.nb_directions):
-                positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)
-
-
-            # Getting the negative rewards in the negative/opposite directions
-            for k in range(hp.nb_directions):
-                negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)
-
-
-        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
-        all_rewards = np.array(positive_rewards + negative_rewards)
-        sigma_r = all_rewards.std()
-
-        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
-        scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
-        order = sorted(scores.keys(), key = lambda x:-scores[x])[:hp.nb_best_directions]
-        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
-
-        # Updating our policy
-        policy.update(rollouts, sigma_r, args)
-
-        # Printing the final reward of the policy after the update
-        reward_evaluation = explore(env, normalizer, policy, None, None, hp)
-        print('Step:', step, 'Reward:', reward_evaluation)
+  for step in range(hp.nb_steps):
+
+    # Initializing the perturbations deltas and the positive/negative rewards
+    deltas = policy.sample_deltas()
+    positive_rewards = [0] * hp.nb_directions
+    negative_rewards = [0] * hp.nb_directions
+
+    if parentPipes:
+      for k in range(hp.nb_directions):
+        parentPipe = parentPipes[k]
+        parentPipe.send([_EXPLORE, [normalizer, policy, hp, "positive", deltas[k]]])
+      for k in range(hp.nb_directions):
+        positive_rewards[k] = parentPipes[k].recv()[0]
+
+      for k in range(hp.nb_directions):
+        parentPipe = parentPipes[k]
+        parentPipe.send([_EXPLORE, [normalizer, policy, hp, "negative", deltas[k]]])
+      for k in range(hp.nb_directions):
+        negative_rewards[k] = parentPipes[k].recv()[0]
+
+    else:
+      # Getting the positive rewards in the positive directions
+      for k in range(hp.nb_directions):
+        positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)
+
+      # Getting the negative rewards in the negative/opposite directions
+      for k in range(hp.nb_directions):
+        negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)
+
+    # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
+    all_rewards = np.array(positive_rewards + negative_rewards)
+    sigma_r = all_rewards.std()
+
+    # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
+    scores = {
+        k: max(r_pos, r_neg)
+        for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))
+    }
+    order = sorted(scores.keys(), key=lambda x: -scores[x])[:hp.nb_best_directions]
+    rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
+
+    # Updating our policy
+    policy.update(rollouts, sigma_r, args)
+
+    # Printing the final reward of the policy after the update
+    reward_evaluation = explore(env, normalizer, policy, None, None, hp)
+    print('Step:', step, 'Reward:', reward_evaluation)
+
 
 
 # Running the main code
@@ -226,19 +228,15 @@ if __name__ == "__main__":
 
   mp.freeze_support()
 
   parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-  parser.add_argument('--env',
-                      help='Gym environment name',
-                      type=str,
-                      default='HalfCheetahBulletEnv-v0')
+  parser.add_argument(
+      '--env', help='Gym environment name', type=str, default='HalfCheetahBulletEnv-v0')
   parser.add_argument('--seed', help='RNG seed', type=int, default=1)
   parser.add_argument('--render', help='OpenGL Visualizer', type=int, default=0)
   parser.add_argument('--movie', help='rgb_array gym movie', type=int, default=0)
   parser.add_argument('--steps', help='Number of steps', type=int, default=10000)
   parser.add_argument('--policy', help='Starting policy file (npy)', type=str, default='')
-  parser.add_argument('--logdir',
-                      help='Directory root to log policy files (npy)',
-                      type=str,
-                      default='.')
+  parser.add_argument(
+      '--logdir', help='Directory root to log policy files (npy)', type=str, default='.')
   parser.add_argument('--mp', help='Enable multiprocessing', type=int, default=1)
   args = parser.parse_args()
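
For context on the pipe protocol in train(): each parent pipe carries an [_EXPLORE, [normalizer, policy, hp, direction, delta]] message out, and the episode reward comes back as recv()[0]. A minimal sketch of the worker loop on the other end of each pipe follows; it reuses the explore() function defined in this file, while the explore_worker name, the _CLOSE shutdown sentinel, and the per-worker gym.make() call are illustrative assumptions rather than code shown in this diff.

import gym
import pybullet_envs  # registers HalfCheetahBulletEnv-v0 with gym

_EXPLORE = 1  # matches the tag sent by train()
_CLOSE = 2    # assumed shutdown sentinel, not shown in the diff

def explore_worker(rank, childPipe, env_name):
  env = gym.make(env_name)  # each worker owns a private environment instance
  while True:
    message, payload = childPipe.recv()  # blocks until train() sends work
    if message == _EXPLORE:
      normalizer, policy, hp, direction, delta = payload
      reward = explore(env, normalizer, policy, direction, delta, hp)
      childPipe.send([reward])  # read by train() as parentPipes[k].recv()[0]
    elif message == _CLOSE:
      break

Each worker would be launched once before training, e.g. mp.Process(target=explore_worker, args=(k, childPipe, hp.env_name)), so that the 2 * nb_directions rollouts per step run concurrently instead of serially.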
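The policy.update(rollouts, sigma_r, args) call is where the actual ARS step happens, but the Policy class sits outside the hunks above. As a reference, here is a minimal sketch of the update rule from the ARS paper (Mania, Guy and Recht, 2018) that the call signature is consistent with: theta moves along the sum of reward-weighted perturbations for the nb_best_directions kept in train(), scaled by learning_rate / (nb_best_directions * sigma_r). The theta attribute and the constructor shape are assumptions for illustration, not necessarily the exact fields used in ars.py.

import numpy as np

class Policy():

  def __init__(self, input_size, output_size, hp):
    self.theta = np.zeros((output_size, input_size))  # weights of a linear policy
    self.hp = hp

  def sample_deltas(self):
    # One Gaussian perturbation per direction, same shape as theta
    return [np.random.randn(*self.theta.shape) for _ in range(self.hp.nb_directions)]

  def update(self, rollouts, sigma_r, args=None):
    # rollouts: [(r_pos, r_neg, delta)] for the best directions selected in train()
    step = np.zeros(self.theta.shape)
    for r_pos, r_neg, delta in rollouts:
      step += (r_pos - r_neg) * delta
    self.theta += self.hp.learning_rate / (self.hp.nb_best_directions * sigma_r) * step

Dividing by sigma_r normalizes the step size against the spread of returns, which is what lets a single learning_rate work across environments with very different reward scales.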