apply yapf

Author: Erwin Coumans
Date: 2019-04-27 14:22:02 -07:00
commit 61c099460c
parent efe73670ef
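yapf rewrites formatting only, so this commit changes how the sources read, not what they do. The same pass can be reproduced with yapf's Python API; the sketch below is illustrative, and the exact .style.yapf settings used by the repository are an assumption (a 2-space, wide-column style matches the output in this diff):

  # Minimal sketch: reformat a source string the way this commit does.
  # The style options are assumptions inferred from the diff output.
  from yapf.yapflib.yapf_api import FormatCode

  src = "scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(p, n))}\n"
  formatted, changed = FormatCode(
      src, style_config='{based_on_style: google, indent_width: 2, column_limit: 100}')
  print(changed)    # True, since yapf rewrites the spacing
  print(formatted)  # same statement, reflowed like the '+' lines below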

@@ -21,16 +21,16 @@ import argparse

 # Setting the Hyper Parameters
 class Hp():
-    def __init__(self):
-        self.nb_steps = 10000
-        self.episode_length = 1000
-        self.learning_rate = 0.02
-        self.nb_directions = 16
-        self.nb_best_directions = 8
-        assert self.nb_best_directions <= self.nb_directions
-        self.noise = 0.03
-        self.seed = 1
-        self.env_name = 'HalfCheetahBulletEnv-v0'
+
+  def __init__(self):
+    self.nb_steps = 10000
+    self.episode_length = 1000
+    self.learning_rate = 0.02
+    self.nb_directions = 16
+    self.nb_best_directions = 8
+    assert self.nb_best_directions <= self.nb_directions
+    self.noise = 0.03
+    self.seed = 1
+    self.env_name = 'HalfCheetahBulletEnv-v0'

 # Multiprocess Exploring the policy on one specific direction and over one episode
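The train() loop in the next hunk calls policy.sample_deltas(), which this diff does not show. In the ARS algorithm this file implements (Mania et al., 2018), that method draws one Gaussian perturbation per direction, shaped like the linear policy's weight matrix; hp.noise scales it during exploration. A hedged sketch of that piece, consistent with the reference ARS code but not necessarily this file's exact implementation:

  import numpy as np

  class Policy():

    def __init__(self, input_size, output_size, hp):
      self.theta = np.zeros((output_size, input_size))  # linear policy weights
      self.hp = hp

    def sample_deltas(self):
      # One random direction per positive/negative rollout pair, shaped like theta.
      return [np.random.randn(*self.theta.shape) for _ in range(self.hp.nb_directions)]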
@@ -165,52 +165,54 @@ def explore(env, normalizer, policy, direction, delta, hp):

 def train(env, policy, normalizer, hp, parentPipes, args):
-    for step in range(hp.nb_steps):
+  for step in range(hp.nb_steps):

-        # Initializing the perturbations deltas and the positive/negative rewards
-        deltas = policy.sample_deltas()
-        positive_rewards = [0] * hp.nb_directions
-        negative_rewards = [0] * hp.nb_directions
+    # Initializing the perturbations deltas and the positive/negative rewards
+    deltas = policy.sample_deltas()
+    positive_rewards = [0] * hp.nb_directions
+    negative_rewards = [0] * hp.nb_directions

-        if parentPipes:
-            for k in range(hp.nb_directions):
-                parentPipe = parentPipes[k]
-                parentPipe.send([_EXPLORE,[normalizer, policy, hp, "positive", deltas[k]]])
-            for k in range(hp.nb_directions):
-                positive_rewards[k] = parentPipes[k].recv()[0]
+    if parentPipes:
+      for k in range(hp.nb_directions):
+        parentPipe = parentPipes[k]
+        parentPipe.send([_EXPLORE, [normalizer, policy, hp, "positive", deltas[k]]])
+      for k in range(hp.nb_directions):
+        positive_rewards[k] = parentPipes[k].recv()[0]

-            for k in range(hp.nb_directions):
-                parentPipe = parentPipes[k]
-                parentPipe.send([_EXPLORE,[normalizer, policy, hp, "negative", deltas[k]]])
-            for k in range(hp.nb_directions):
-                negative_rewards[k] = parentPipes[k].recv()[0]
+      for k in range(hp.nb_directions):
+        parentPipe = parentPipes[k]
+        parentPipe.send([_EXPLORE, [normalizer, policy, hp, "negative", deltas[k]]])
+      for k in range(hp.nb_directions):
+        negative_rewards[k] = parentPipes[k].recv()[0]

-        else:
-            # Getting the positive rewards in the positive directions
-            for k in range(hp.nb_directions):
-                positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)
+    else:
+      # Getting the positive rewards in the positive directions
+      for k in range(hp.nb_directions):
+        positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)

-            # Getting the negative rewards in the negative/opposite directions
-            for k in range(hp.nb_directions):
-                negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)
+      # Getting the negative rewards in the negative/opposite directions
+      for k in range(hp.nb_directions):
+        negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)

-        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
-        all_rewards = np.array(positive_rewards + negative_rewards)
-        sigma_r = all_rewards.std()
+    # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
+    all_rewards = np.array(positive_rewards + negative_rewards)
+    sigma_r = all_rewards.std()

-        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
-        scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
-        order = sorted(scores.keys(), key = lambda x:-scores[x])[:hp.nb_best_directions]
-        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
+    # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
+    scores = {
+        k: max(r_pos, r_neg)
+        for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))
+    }
+    order = sorted(scores.keys(), key=lambda x: -scores[x])[:hp.nb_best_directions]
+    rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

-        # Updating our policy
-        policy.update(rollouts, sigma_r, args)
+    # Updating our policy
+    policy.update(rollouts, sigma_r, args)

-        # Printing the final reward of the policy after the update
-        reward_evaluation = explore(env, normalizer, policy, None, None, hp)
-        print('Step:', step, 'Reward:', reward_evaluation)
+    # Printing the final reward of the policy after the update
+    reward_evaluation = explore(env, normalizer, policy, None, None, hp)
+    print('Step:', step, 'Reward:', reward_evaluation)

 # Running the main code
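Two details worth noting in the hunk above. First, each worker process answers an [_EXPLORE, payload] message by sending back a list, which is why the parent reads parentPipes[k].recv()[0]. Second, policy.update(rollouts, sigma_r, args) consumes the hp.nb_best_directions selected rollouts; in standard ARS the update is a finite-difference gradient step scaled by the reward standard deviation. Continuing the Policy sketch from above, under that assumption (the real method also receives args, unused here):

  def update(self, rollouts, sigma_r, args):
    # theta += lr / (b * sigma_r) * sum((r_pos - r_neg) * delta) over the b best directions.
    step = np.zeros(self.theta.shape)
    for r_pos, r_neg, delta in rollouts:
      step += (r_pos - r_neg) * delta
    self.theta += self.hp.learning_rate / (self.hp.nb_best_directions * sigma_r) * step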
@@ -226,19 +228,15 @@ if __name__ == "__main__":
   mp.freeze_support()

   parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-  parser.add_argument('--env',
-                      help='Gym environment name',
-                      type=str,
-                      default='HalfCheetahBulletEnv-v0')
+  parser.add_argument(
+      '--env', help='Gym environment name', type=str, default='HalfCheetahBulletEnv-v0')
   parser.add_argument('--seed', help='RNG seed', type=int, default=1)
   parser.add_argument('--render', help='OpenGL Visualizer', type=int, default=0)
   parser.add_argument('--movie', help='rgb_array gym movie', type=int, default=0)
   parser.add_argument('--steps', help='Number of steps', type=int, default=10000)
   parser.add_argument('--policy', help='Starting policy file (npy)', type=str, default='')
-  parser.add_argument('--logdir',
-                      help='Directory root to log policy files (npy)',
-                      type=str,
-                      default='.')
+  parser.add_argument(
+      '--logdir', help='Directory root to log policy files (npy)', type=str, default='.')
   parser.add_argument('--mp', help='Enable multiprocessing', type=int, default=1)
   args = parser.parse_args()
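With the defaults above, a typical training run looks like the following (the script is the pybullet ARS example, ars.py; the file path itself is not shown in this diff):

  python ars.py --env HalfCheetahBulletEnv-v0 --steps 10000 --mp 1 --logdir .

Passing --mp 0 falls back to the single-process explore() path in the else branch of train().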