add yapf style and apply yapf to format all Python files
This recreates pull request #2192
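For context, applying a shared yapf style usually means committing a style file (e.g. a `.style.yapf`) and re-running yapf over every Python file in the tree. The sketch below shows one way such a pass could be reproduced with yapf's Python API; the style values are assumptions inferred from the formatted output in this diff (2-space indents, a wide column limit), not the exact settings added by this commit.

import os

from yapf.yapflib.yapf_api import FormatFile

# Assumed style, approximating what the formatted code in this diff looks like.
ASSUMED_STYLE = '{based_on_style: pep8, indent_width: 2, column_limit: 100}'

for root, _, files in os.walk('.'):
  for name in files:
    if name.endswith('.py'):
      # in_place=True rewrites each file with its reformatted source.
      FormatFile(os.path.join(root, name), style_config=ASSUMED_STYLE, in_place=True)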
@@ -4,7 +4,7 @@ import os
import inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
os.sys.path.insert(0,parentdir)
os.sys.path.insert(0, parentdir)

# Importing the libraries
import os
@@ -20,17 +20,17 @@ import argparse

# Setting the Hyper Parameters
class Hp():

  def __init__(self):
    self.nb_steps = 10000
    self.episode_length = 1000
    self.learning_rate = 0.02
    self.nb_directions = 16
    self.nb_best_directions = 16
    assert self.nb_best_directions <= self.nb_directions
    self.noise = 0.03
    self.seed = 1
    self.env_name = 'HalfCheetahBulletEnv-v0'

  def __init__(self):
    self.nb_steps = 10000
    self.episode_length = 1000
    self.learning_rate = 0.02
    self.nb_directions = 16
    self.nb_best_directions = 16
    assert self.nb_best_directions <= self.nb_directions
    self.noise = 0.03
    self.seed = 1
    self.env_name = 'HalfCheetahBulletEnv-v0'


# Multiprocess Exploring the policy on one specific direction and over one episode
@@ -39,111 +39,37 @@ _RESET = 1
_CLOSE = 2
_EXPLORE = 3

def ExploreWorker(rank,childPipe, envname, args):
  env = gym.make(envname)
  nb_inputs = env.observation_space.shape[0]
  normalizer = Normalizer(nb_inputs)
  observation_n = env.reset()
  n=0
  while True:
    n+=1
    try:
      # Only block for short times to have keyboard exceptions be raised.
      if not childPipe.poll(0.001):
        continue
      message, payload = childPipe.recv()
    except (EOFError, KeyboardInterrupt):
      break
    if message == _RESET:
      observation_n = env.reset()
      childPipe.send(["reset ok"])

def ExploreWorker(rank, childPipe, envname, args):
  env = gym.make(envname)
  nb_inputs = env.observation_space.shape[0]
  normalizer = Normalizer(nb_inputs)
  observation_n = env.reset()
  n = 0
  while True:
    n += 1
    try:
      # Only block for short times to have keyboard exceptions be raised.
      if not childPipe.poll(0.001):
        continue
    if message == _EXPLORE:
      #normalizer = payload[0] #use our local normalizer
      policy = payload[1]
      hp = payload[2]
      direction = payload[3]
      delta = payload[4]
      state = env.reset()
      done = False
      num_plays = 0.
      sum_rewards = 0
      while not done and num_plays < hp.episode_length:
        normalizer.observe(state)
        state = normalizer.normalize(state)
        action = policy.evaluate(state, delta, direction,hp)
        state, reward, done, _ = env.step(action)
        reward = max(min(reward, 1), -1)
        sum_rewards += reward
        num_plays += 1
      childPipe.send([sum_rewards])
      continue
    if message == _CLOSE:
      childPipe.send(["close ok"])
      break
  childPipe.close()

# Normalizing the states

class Normalizer():

  def __init__(self, nb_inputs):
    self.n = np.zeros(nb_inputs)
    self.mean = np.zeros(nb_inputs)
    self.mean_diff = np.zeros(nb_inputs)
    self.var = np.zeros(nb_inputs)

  def observe(self, x):
    self.n += 1.
    last_mean = self.mean.copy()
    self.mean += (x - self.mean) / self.n
    self.mean_diff += (x - last_mean) * (x - self.mean)
    self.var = (self.mean_diff / self.n).clip(min = 1e-2)

  def normalize(self, inputs):
    obs_mean = self.mean
    obs_std = np.sqrt(self.var)
    return (inputs - obs_mean) / obs_std

# Building the AI

class Policy():
  def __init__(self, input_size, output_size, env_name, args):
    try:
      self.theta = np.load(args.policy)
    except:
      self.theta = np.zeros((output_size, input_size))
    self.env_name = env_name
    print("Starting policy theta=",self.theta)
  def evaluate(self, input, delta, direction, hp):
    if direction is None:
      return np.clip(self.theta.dot(input), -1.0, 1.0)
    elif direction == "positive":
      return np.clip((self.theta + hp.noise*delta).dot(input), -1.0, 1.0)
    else:
      return np.clip((self.theta - hp.noise*delta).dot(input), -1.0, 1.0)

  def sample_deltas(self):
    return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]

  def update(self, rollouts, sigma_r, args):
    step = np.zeros(self.theta.shape)
    for r_pos, r_neg, d in rollouts:
      step += (r_pos - r_neg) * d
    self.theta += hp.learning_rate / (hp.nb_best_directions * sigma_r) * step
    timestr = time.strftime("%Y%m%d-%H%M%S")
    np.save(args.logdir+"/policy_"+self.env_name+"_"+timestr+".npy", self.theta)

# Exploring the policy on one specific direction and over one episode

def explore(env, normalizer, policy, direction, delta, hp):
  state = env.reset()
  done = False
  num_plays = 0.
  sum_rewards = 0
  while not done and num_plays < hp.episode_length:
      message, payload = childPipe.recv()
    except (EOFError, KeyboardInterrupt):
      break
    if message == _RESET:
      observation_n = env.reset()
      childPipe.send(["reset ok"])
      continue
    if message == _EXPLORE:
      #normalizer = payload[0] #use our local normalizer
      policy = payload[1]
      hp = payload[2]
      direction = payload[3]
      delta = payload[4]
      state = env.reset()
      done = False
      num_plays = 0.
      sum_rewards = 0
      while not done and num_plays < hp.episode_length:
        normalizer.observe(state)
        state = normalizer.normalize(state)
        action = policy.evaluate(state, delta, direction, hp)
@@ -151,127 +77,217 @@ def explore(env, normalizer, policy, direction, delta, hp):
      reward = max(min(reward, 1), -1)
      sum_rewards += reward
      num_plays += 1
  return sum_rewards
      childPipe.send([sum_rewards])
      continue
    if message == _CLOSE:
      childPipe.send(["close ok"])
      break
  childPipe.close()

# Normalizing the states


class Normalizer():

  def __init__(self, nb_inputs):
    self.n = np.zeros(nb_inputs)
    self.mean = np.zeros(nb_inputs)
    self.mean_diff = np.zeros(nb_inputs)
    self.var = np.zeros(nb_inputs)

  def observe(self, x):
    self.n += 1.
    last_mean = self.mean.copy()
    self.mean += (x - self.mean) / self.n
    self.mean_diff += (x - last_mean) * (x - self.mean)
    self.var = (self.mean_diff / self.n).clip(min=1e-2)

  def normalize(self, inputs):
    obs_mean = self.mean
    obs_std = np.sqrt(self.var)
    return (inputs - obs_mean) / obs_std

# Building the AI


class Policy():

  def __init__(self, input_size, output_size, env_name, args):
    try:
      self.theta = np.load(args.policy)
    except:
      self.theta = np.zeros((output_size, input_size))
    self.env_name = env_name
    print("Starting policy theta=", self.theta)

  def evaluate(self, input, delta, direction, hp):
    if direction is None:
      return np.clip(self.theta.dot(input), -1.0, 1.0)
    elif direction == "positive":
      return np.clip((self.theta + hp.noise * delta).dot(input), -1.0, 1.0)
    else:
      return np.clip((self.theta - hp.noise * delta).dot(input), -1.0, 1.0)

  def sample_deltas(self):
    return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]

  def update(self, rollouts, sigma_r, args):
    step = np.zeros(self.theta.shape)
    for r_pos, r_neg, d in rollouts:
      step += (r_pos - r_neg) * d
    self.theta += hp.learning_rate / (hp.nb_best_directions * sigma_r) * step
    timestr = time.strftime("%Y%m%d-%H%M%S")
    np.save(args.logdir + "/policy_" + self.env_name + "_" + timestr + ".npy", self.theta)

# Exploring the policy on one specific direction and over one episode


def explore(env, normalizer, policy, direction, delta, hp):
  state = env.reset()
  done = False
  num_plays = 0.
  sum_rewards = 0
  while not done and num_plays < hp.episode_length:
    normalizer.observe(state)
    state = normalizer.normalize(state)
    action = policy.evaluate(state, delta, direction, hp)
    state, reward, done, _ = env.step(action)
    reward = max(min(reward, 1), -1)
    sum_rewards += reward
    num_plays += 1
  return sum_rewards

# Training the AI


def train(env, policy, normalizer, hp, parentPipes, args):

  for step in range(hp.nb_steps):

    # Initializing the perturbations deltas and the positive/negative rewards
    deltas = policy.sample_deltas()
    positive_rewards = [0] * hp.nb_directions
    negative_rewards = [0] * hp.nb_directions

    if parentPipes:
      for k in range(hp.nb_directions):
        parentPipe = parentPipes[k]
        parentPipe.send([_EXPLORE,[normalizer, policy, hp, "positive", deltas[k]]])
      for k in range(hp.nb_directions):
        positive_rewards[k] = parentPipes[k].recv()[0]

      for k in range(hp.nb_directions):
        parentPipe = parentPipes[k]
        parentPipe.send([_EXPLORE,[normalizer, policy, hp, "negative", deltas[k]]])
      for k in range(hp.nb_directions):
        negative_rewards[k] = parentPipes[k].recv()[0]

    else:
      # Getting the positive rewards in the positive directions
      for k in range(hp.nb_directions):
        positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)

      # Getting the negative rewards in the negative/opposite directions
      for k in range(hp.nb_directions):
        negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)

    # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
    all_rewards = np.array(positive_rewards + negative_rewards)
    sigma_r = all_rewards.std()

    # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
    scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
    order = sorted(scores.keys(), key = lambda x:scores[x])[:hp.nb_best_directions]
    rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

    # Updating our policy
    policy.update(rollouts, sigma_r, args)

    # Printing the final reward of the policy after the update
    reward_evaluation = explore(env, normalizer, policy, None, None, hp)
    print('Step:', step, 'Reward:', reward_evaluation)

  for step in range(hp.nb_steps):

    # Initializing the perturbations deltas and the positive/negative rewards
    deltas = policy.sample_deltas()
    positive_rewards = [0] * hp.nb_directions
    negative_rewards = [0] * hp.nb_directions

    if parentPipes:
      for k in range(hp.nb_directions):
        parentPipe = parentPipes[k]
        parentPipe.send([_EXPLORE, [normalizer, policy, hp, "positive", deltas[k]]])
      for k in range(hp.nb_directions):
        positive_rewards[k] = parentPipes[k].recv()[0]

      for k in range(hp.nb_directions):
        parentPipe = parentPipes[k]
        parentPipe.send([_EXPLORE, [normalizer, policy, hp, "negative", deltas[k]]])
      for k in range(hp.nb_directions):
        negative_rewards[k] = parentPipes[k].recv()[0]

    else:
      # Getting the positive rewards in the positive directions
      for k in range(hp.nb_directions):
        positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)

      # Getting the negative rewards in the negative/opposite directions
      for k in range(hp.nb_directions):
        negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)

    # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
    all_rewards = np.array(positive_rewards + negative_rewards)
    sigma_r = all_rewards.std()

    # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
    scores = {
        k: max(r_pos, r_neg)
        for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))
    }
    order = sorted(scores.keys(), key=lambda x: scores[x])[:hp.nb_best_directions]
    rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

    # Updating our policy
    policy.update(rollouts, sigma_r, args)

    # Printing the final reward of the policy after the update
    reward_evaluation = explore(env, normalizer, policy, None, None, hp)
    print('Step:', step, 'Reward:', reward_evaluation)


# Running the main code

def mkdir(base, name):
  path = os.path.join(base, name)
  if not os.path.exists(path):
    os.makedirs(path)
  return path


  path = os.path.join(base, name)
  if not os.path.exists(path):
    os.makedirs(path)
  return path


if __name__ == "__main__":
  mp.freeze_support()
  mp.freeze_support()

  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--env', help='Gym environment name', type=str, default='HalfCheetahBulletEnv-v0')
  parser.add_argument('--seed', help='RNG seed', type=int, default=1)
  parser.add_argument('--render', help='OpenGL Visualizer', type=int, default=0)
  parser.add_argument('--movie',help='rgb_array gym movie',type=int, default=0)
  parser.add_argument('--steps', help='Number of steps', type=int, default=10000)
  parser.add_argument('--policy', help='Starting policy file (npy)', type=str, default='')
  parser.add_argument('--logdir', help='Directory root to log policy files (npy)', type=str, default='.')
  parser.add_argument('--mp', help='Enable multiprocessing', type=int, default=1)

  args = parser.parse_args()

  hp = Hp()
  hp.env_name = args.env
  hp.seed = args.seed
  hp.nb_steps = args.steps
  print("seed = ", hp.seed)
  np.random.seed(hp.seed)
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--env',
                      help='Gym environment name',
                      type=str,
                      default='HalfCheetahBulletEnv-v0')
  parser.add_argument('--seed', help='RNG seed', type=int, default=1)
  parser.add_argument('--render', help='OpenGL Visualizer', type=int, default=0)
  parser.add_argument('--movie', help='rgb_array gym movie', type=int, default=0)
  parser.add_argument('--steps', help='Number of steps', type=int, default=10000)
  parser.add_argument('--policy', help='Starting policy file (npy)', type=str, default='')
  parser.add_argument('--logdir',
                      help='Directory root to log policy files (npy)',
                      type=str,
                      default='.')
  parser.add_argument('--mp', help='Enable multiprocessing', type=int, default=1)

  parentPipes = None
  if args.mp:
    num_processes = hp.nb_directions
    processes = []
    childPipes = []
    parentPipes = []

    for pr in range (num_processes):
      parentPipe, childPipe = Pipe()
      parentPipes.append(parentPipe)
      childPipes.append(childPipe)

    for rank in range(num_processes):
      p = mp.Process(target=ExploreWorker, args=(rank,childPipes[rank], hp.env_name, args))
      p.start()
      processes.append(p)

  work_dir = mkdir('exp', 'brs')
  monitor_dir = mkdir(work_dir, 'monitor')
  env = gym.make(hp.env_name)
  if args.render:
    env.render(mode = "human")
  if args.movie:
    env = wrappers.Monitor(env, monitor_dir, force = True)
  nb_inputs = env.observation_space.shape[0]
  nb_outputs = env.action_space.shape[0]
  policy = Policy(nb_inputs, nb_outputs,hp.env_name, args)
  normalizer = Normalizer(nb_inputs)

  print("start training")
  train(env, policy, normalizer, hp, parentPipes, args)
  args = parser.parse_args()

  if args.mp:
    for parentPipe in parentPipes:
      parentPipe.send([_CLOSE,"pay2"])

    for p in processes:
      p.join()
  hp = Hp()
  hp.env_name = args.env
  hp.seed = args.seed
  hp.nb_steps = args.steps
  print("seed = ", hp.seed)
  np.random.seed(hp.seed)

  parentPipes = None
  if args.mp:
    num_processes = hp.nb_directions
    processes = []
    childPipes = []
    parentPipes = []

    for pr in range(num_processes):
      parentPipe, childPipe = Pipe()
      parentPipes.append(parentPipe)
      childPipes.append(childPipe)

    for rank in range(num_processes):
      p = mp.Process(target=ExploreWorker, args=(rank, childPipes[rank], hp.env_name, args))
      p.start()
      processes.append(p)

  work_dir = mkdir('exp', 'brs')
  monitor_dir = mkdir(work_dir, 'monitor')
  env = gym.make(hp.env_name)
  if args.render:
    env.render(mode="human")
  if args.movie:
    env = wrappers.Monitor(env, monitor_dir, force=True)
  nb_inputs = env.observation_space.shape[0]
  nb_outputs = env.action_space.shape[0]
  policy = Policy(nb_inputs, nb_outputs, hp.env_name, args)
  normalizer = Normalizer(nb_inputs)

print("start training")
|
||||
train(env, policy, normalizer, hp, parentPipes, args)
|
||||
|
||||
if args.mp:
|
||||
for parentPipe in parentPipes:
|
||||
parentPipe.send([_CLOSE, "pay2"])
|
||||
|
||||
for p in processes:
|
||||
p.join()
|
||||
|
||||
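Based on the argument parser shown in this diff, a typical invocation of the reformatted script would look like the line below. The file name ars.py is an assumption, since the diff header naming the file is not shown here; the flag names and defaults come from the add_argument calls above.

python ars.py --env HalfCheetahBulletEnv-v0 --steps 10000 --mp 1 --logdir .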