add ARS to train/eval Minitaur
397 examples/pybullet/gym/pybullet_envs/ARS/ars.py Normal file
@@ -0,0 +1,397 @@
"""Internal implementation of the Augmented Random Search method."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
os.sys.path.insert(0, currentdir)

from concurrent import futures
import copy
import os
import time
import gym
import numpy as np
import logz
import utils
import optimizers
# from google3.pyglib import gfile
import policies
import shared_noise
import utility


class Worker(object):
  """Object class for parallel rollout generation."""

  def __init__(self,
               env_seed,
               env_callback,
               policy_params=None,
               deltas=None,
               rollout_length=1000,
               delta_std=0.02):

    # initialize OpenAI environment for each worker
    self.env = env_callback()
    self.env.seed(env_seed)

    # each worker gets access to the shared noise table
    # with independent random streams for sampling
    # from the shared noise table.
    self.deltas = shared_noise.SharedNoiseTable(deltas, env_seed + 7)
    self.policy_params = policy_params
    if policy_params['type'] == 'linear':
      self.policy = policies.LinearPolicy(policy_params)
    else:
      raise NotImplementedError

    self.delta_std = delta_std
    self.rollout_length = rollout_length

  def get_weights_plus_stats(self):
    """Get current policy weights and current statistics of past states."""
    assert self.policy_params['type'] == 'linear'
    return self.policy.get_weights_plus_stats()

  def rollout(self, shift=0., rollout_length=None):
    """Performs one rollout of maximum length rollout_length.

    At each time step it subtracts shift from the reward.
    """
    if rollout_length is None:
      rollout_length = self.rollout_length

    total_reward = 0.
    steps = 0

    ob = self.env.reset()
    for i in range(rollout_length):
      action = self.policy.act(ob)
      ob, reward, done, _ = self.env.step(action)
      steps += 1
      total_reward += (reward - shift)
      if done:
        break

    return total_reward, steps

  def do_rollouts(self, w_policy, num_rollouts=1, shift=1, evaluate=False):
    """Generate multiple rollouts with a policy parametrized by w_policy."""
    print('Doing {} rollouts'.format(num_rollouts))
    rollout_rewards, deltas_idx = [], []
    steps = 0

    for i in range(num_rollouts):

      if evaluate:
        self.policy.update_weights(w_policy)
        deltas_idx.append(-1)

        # set to false so that evaluation rollouts are not used for updating state statistics
        self.policy.update_filter = False

        # for evaluation we do not shift the rewards (shift = 0) and we use the
        # default rollout length (1000 for the MuJoCo locomotion tasks)
        reward, r_steps = self.rollout(
            shift=0., rollout_length=self.rollout_length)
        rollout_rewards.append(reward)

      else:
        idx, delta = self.deltas.get_delta(w_policy.size)

        delta = (self.delta_std * delta).reshape(w_policy.shape)
        deltas_idx.append(idx)

        # set to true so that state statistics are updated
        self.policy.update_filter = True

        # compute reward and number of timesteps used for positive perturbation rollout
        self.policy.update_weights(w_policy + delta)
        pos_reward, pos_steps = self.rollout(shift=shift)

        # compute reward and number of timesteps used for negative perturbation rollout
        self.policy.update_weights(w_policy - delta)
        neg_reward, neg_steps = self.rollout(shift=shift)
        steps += pos_steps + neg_steps

        rollout_rewards.append([pos_reward, neg_reward])

    return {
        'deltas_idx': deltas_idx,
        'rollout_rewards': rollout_rewards,
        'steps': steps
    }

  def stats_increment(self):
    self.policy.observation_filter.stats_increment()
    return

  def get_weights(self):
    return self.policy.get_weights()

  def get_filter(self):
    return self.policy.observation_filter

  def sync_filter(self, other):
    self.policy.observation_filter.sync(other)
    return


class ARSLearner(object):
  """Object class implementing the ARS algorithm."""

  def __init__(self,
               env_callback,
               policy_params=None,
               num_workers=32,
               num_deltas=320,
               deltas_used=320,
               delta_std=0.02,
               logdir=None,
               rollout_length=1000,
               step_size=0.01,
               shift='constant zero',
               params=None,
               seed=123):

    logz.configure_output_dir(logdir)
    # params_to_save = copy.deepcopy(params)
    # params_to_save['env'] = None
    # logz.save_params(params_to_save)
    utility.save_config(params, logdir)
    env = env_callback()

    self.timesteps = 0
    self.action_size = env.action_space.shape[0]
    self.ob_size = env.observation_space.shape[0]
    self.num_deltas = num_deltas
    self.deltas_used = deltas_used
    self.rollout_length = rollout_length
    self.step_size = step_size
    self.delta_std = delta_std
    self.logdir = logdir
    self.shift = shift
    self.params = params
    self.max_past_avg_reward = float('-inf')
    self.num_episodes_used = float('inf')

    # create shared table for storing noise
    print('Creating deltas table.')
    deltas = shared_noise.create_shared_noise()
    self.deltas = shared_noise.SharedNoiseTable(deltas, seed=seed + 3)
    print('Created deltas table.')

    # initialize workers with different random seeds
    print('Initializing workers.')
    self.num_workers = num_workers
    self.workers = [
        Worker(
            seed + 7 * i,
            env_callback=env_callback,
            policy_params=policy_params,
            deltas=deltas,
            rollout_length=rollout_length,
            delta_std=delta_std) for i in range(num_workers)
    ]

    # initialize policy
    if policy_params['type'] == 'linear':
      self.policy = policies.LinearPolicy(policy_params)
      self.w_policy = self.policy.get_weights()
    else:
      raise NotImplementedError

    # initialize optimization algorithm
    self.optimizer = optimizers.SGD(self.w_policy, self.step_size)
    print('Initialization of ARS complete.')

  def aggregate_rollouts(self, num_rollouts=None, evaluate=False):
    """Aggregate update step from rollouts generated in parallel."""
    if num_rollouts is None:
      num_deltas = self.num_deltas
    else:
      num_deltas = num_rollouts

    results_one = []  # rollout_ids_one
    results_two = []  # rollout_ids_two

    t1 = time.time()
    num_rollouts = int(num_deltas / self.num_workers)
    # if num_rollouts > 0:
    #   with futures.ThreadPoolExecutor(
    #       max_workers=self.num_workers) as executor:
    #     workers = [
    #         executor.submit(
    #             worker.do_rollouts,
    #             self.w_policy,
    #             num_rollouts=num_rollouts,
    #             shift=self.shift,
    #             evaluate=evaluate) for worker in self.workers
    #     ]
    #     for worker in futures.as_completed(workers):
    #       results_one.append(worker.result())
    #
    #     workers = [
    #         executor.submit(
    #             worker.do_rollouts,
    #             self.w_policy,
    #             num_rollouts=1,
    #             shift=self.shift,
    #             evaluate=evaluate)
    #         for worker in self.workers[:(num_deltas % self.num_workers)]
    #     ]
    #     for worker in futures.as_completed(workers):
    #       results_two.append(worker.result())

    # parallel generation of rollouts
    rollout_ids_one = [
        worker.do_rollouts(
            self.w_policy,
            num_rollouts=num_rollouts,
            shift=self.shift,
            evaluate=evaluate) for worker in self.workers
    ]

    rollout_ids_two = [
        worker.do_rollouts(
            self.w_policy, num_rollouts=1, shift=self.shift, evaluate=evaluate)
        for worker in self.workers[:(num_deltas % self.num_workers)]
    ]
    results_one = rollout_ids_one
    results_two = rollout_ids_two

    # gather results
    rollout_rewards, deltas_idx = [], []

    for result in results_one:
      if not evaluate:
        self.timesteps += result['steps']
      deltas_idx += result['deltas_idx']
      rollout_rewards += result['rollout_rewards']

    for result in results_two:
      if not evaluate:
        self.timesteps += result['steps']
      deltas_idx += result['deltas_idx']
      rollout_rewards += result['rollout_rewards']

    deltas_idx = np.array(deltas_idx)
    rollout_rewards = np.array(rollout_rewards, dtype=np.float64)

    print('Maximum reward of collected rollouts:', rollout_rewards.max())
    info_dict = {"max_reward": rollout_rewards.max()}
    t2 = time.time()

    print('Time to generate rollouts:', t2 - t1)

    if evaluate:
      return rollout_rewards

    # select top performing directions if deltas_used < num_deltas
    max_rewards = np.max(rollout_rewards, axis=1)
    if self.deltas_used > self.num_deltas:
      self.deltas_used = self.num_deltas

    idx = np.arange(max_rewards.size)[max_rewards >= np.percentile(
        max_rewards, 100 * (1 - (self.deltas_used / self.num_deltas)))]
    deltas_idx = deltas_idx[idx]
    rollout_rewards = rollout_rewards[idx, :]

    # normalize rewards by their standard deviation
    rollout_rewards /= np.std(rollout_rewards)

    t1 = time.time()
    # aggregate rollouts to form g_hat, the gradient used to compute SGD step
    g_hat, count = utils.batched_weighted_sum(
        rollout_rewards[:, 0] - rollout_rewards[:, 1],
        (self.deltas.get(idx, self.w_policy.size) for idx in deltas_idx),
        batch_size=500)
    g_hat /= deltas_idx.size
    t2 = time.time()
    print('time to aggregate rollouts', t2 - t1)
    return g_hat, info_dict

  def train_step(self):
    """Perform one update step of the policy weights."""
    g_hat, info_dict = self.aggregate_rollouts()
    print('Euclidean norm of update step:', np.linalg.norm(g_hat))
    self.w_policy -= self.optimizer._compute_step(g_hat).reshape(
        self.w_policy.shape)
    return info_dict

  def train(self, num_iter):

    start = time.time()
    for i in range(num_iter):

      t1 = time.time()
      info_dict = self.train_step()
      t2 = time.time()
      print('total time of one step', t2 - t1)
      print('iter ', i, ' done')

      # record statistics every 10 iterations
      if i % 10 == 0:

        rewards = self.aggregate_rollouts(num_rollouts=8, evaluate=True)
        w = self.workers[0].get_weights_plus_stats()

        checkpoint_filename = os.path.join(
            self.logdir, 'lin_policy_plus_{:03d}.npz'.format(i))
        print('Save checkpoints to {}...'.format(checkpoint_filename))
        checkpoint_file = open(checkpoint_filename, 'wb')  # binary mode for np.savez
        np.savez(checkpoint_file, w)
        print('End save checkpoints.')
        print(sorted(self.params.items()))
        logz.log_tabular('Time', time.time() - start)
        logz.log_tabular('Iteration', i + 1)
        logz.log_tabular('AverageReward', np.mean(rewards))
        logz.log_tabular('StdRewards', np.std(rewards))
        logz.log_tabular('MaxRewardRollout', np.max(rewards))
        logz.log_tabular('MinRewardRollout', np.min(rewards))
        logz.log_tabular('timesteps', self.timesteps)
        logz.dump_tabular()

      t1 = time.time()
      # get statistics from all workers
      for j in range(self.num_workers):
        self.policy.observation_filter.update(self.workers[j].get_filter())
      self.policy.observation_filter.stats_increment()

      # make sure master filter buffer is clear
      self.policy.observation_filter.clear_buffer()
      # sync all workers
      # filter_id = ray.put(self.policy.observation_filter)
      setting_filters_ids = [
          worker.sync_filter(self.policy.observation_filter)
          for worker in self.workers
      ]
      # waiting for sync of all workers
      # ray.get(setting_filters_ids)

      increment_filters_ids = [
          worker.stats_increment() for worker in self.workers
      ]
      # waiting for increment of all workers
      # ray.get(increment_filters_ids)
      t2 = time.time()
      print('Time to sync statistics:', t2 - t1)

    return info_dict
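For reference, a minimal sketch of driving ARSLearner directly, without the gRPC server/client pair used below. The Pendulum-v0 environment and the tiny hyperparameters are illustrative stand-ins, not the tuned Minitaur settings, and the sketch assumes the classic Gym API (env.seed(), four-tuple step()) that this code targets:

import gym  # assumes the classic Gym API used throughout this commit
import ars

def make_env():
  return gym.make('Pendulum-v0')  # illustrative stand-in for the Minitaur env

demo_env = make_env()
policy_params = {
    'type': 'linear',
    'ob_filter': 'MeanStdFilter',
    'ob_dim': demo_env.observation_space.shape[0],
    'ac_dim': demo_env.action_space.shape[0],
}
learner = ars.ARSLearner(
    env_callback=make_env,
    policy_params=policy_params,
    num_workers=2,         # tiny values so the demo finishes quickly
    num_deltas=8,
    deltas_used=8,
    delta_std=0.03,
    logdir='/tmp/ars_demo',  # arbitrary demo path
    rollout_length=200,
    step_size=0.02,
    shift=0,
    params={'filter': 'MeanStdFilter'},  # dumped to config.yaml by utility
    seed=237)
learner.train(num_iter=10)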
62 examples/pybullet/gym/pybullet_envs/ARS/ars_server.py Normal file
@@ -0,0 +1,62 @@
"""
blaze build -c opt //experimental/users/jietan/ARS:ars_server

blaze-bin/experimental/users/jietan/ARS/ars_server \
--config_name=MINITAUR_GYM_CONFIG
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time
from absl import app
from absl import flags
from concurrent import futures
import grpc
from grpc import loas2
from google3.robotics.reinforcement_learning.minitaur.envs import minitaur_gym_env
from google3.robotics.reinforcement_learning.minitaur.envs import minitaur_reactive_env
from google3.robotics.reinforcement_learning.minitaur.envs.env_randomizers import minitaur_env_randomizer
from google3.robotics.reinforcement_learning.minitaur.envs.env_randomizers import minitaur_env_randomizer_from_config as randomizer_config_lib
from google3.experimental.users.jietan.ARS import ars_evaluation_service_pb2_grpc
from google3.experimental.users.jietan.ARS import ars_evaluation_service

FLAGS = flags.FLAGS
flags.DEFINE_integer("server_id", 0, "The id of this server.")
flags.DEFINE_integer("port", 20000, "port number.")
flags.DEFINE_string("config_name", None, "The name of the config dictionary.")
flags.DEFINE_bool('run_on_borg', False,
                  'Whether the servers are running on borg.')

_ONE_DAY_IN_SECONDS = 60 * 60 * 24


def main(unused_argv):
  servers = []
  server_creds = loas2.loas2_server_credentials()
  port = FLAGS.port
  if not FLAGS.run_on_borg:
    port = 20000 + FLAGS.server_id
  server = grpc.server(
      futures.ThreadPoolExecutor(max_workers=10), ports=(port,))
  servicer = ars_evaluation_service.ParameterEvaluationServicer(
      FLAGS.config_name, worker_id=FLAGS.server_id)
  ars_evaluation_service_pb2_grpc.add_EvaluationServicer_to_server(
      servicer, server)
  server.add_secure_port("[::]:{}".format(port), server_creds)
  servers.append(server)
  server.start()
  print("Start server {}".format(FLAGS.server_id))

  # prevent the main thread from exiting
  try:
    while True:
      time.sleep(_ONE_DAY_IN_SECONDS)
  except KeyboardInterrupt:
    for server in servers:
      server.stop(0)


if __name__ == "__main__":
  app.run(main)
83 examples/pybullet/gym/pybullet_envs/ARS/config_ars.py Normal file
@@ -0,0 +1,83 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
from pybullet_envs.minitaur.envs import minitaur_gym_env
from pybullet_envs.minitaur.envs import minitaur_reactive_env
from pybullet_envs.minitaur.envs.env_randomizers import minitaur_env_randomizer
from pybullet_envs.minitaur.envs.env_randomizers import minitaur_env_randomizer_from_config as randomizer_config_lib

MAX_LENGTH = 1000


def merge_two_dicts(x, y):
  """Given two dicts, merge them into a new dict as a shallow copy."""
  z = dict(x)
  z.update(y)
  return z


# The default configurations.
DEFAULT_CONFIG = dict(
    num_workers=8,
    num_directions=8,
    num_iterations=1000,
    deltas_used=8,
    step_size=0.02,
    delta_std=0.03,
    rollout_length=MAX_LENGTH,
    shift=0,
    seed=237,
    policy_type="linear",
    filter="MeanStdFilter",
)

# Configuration specific to minitaur_gym_env.MinitaurGymEnv class.
MINITAUR_GYM_CONFIG_ADDITIONS = dict(
    env=functools.partial(
        minitaur_gym_env.MinitaurGymEnv,
        urdf_version=minitaur_gym_env.DERPY_V0_URDF_VERSION,
        accurate_motor_model_enabled=True,
        motor_overheat_protection=True,
        pd_control_enabled=True,
        env_randomizer=None,  # minitaur_env_randomizer.MinitaurEnvRandomizer(),
        render=False,
        num_steps_to_log=MAX_LENGTH))
MINITAUR_GYM_CONFIG = merge_two_dicts(DEFAULT_CONFIG,
                                      MINITAUR_GYM_CONFIG_ADDITIONS)

# Configuration specific to MinitaurReactiveEnv class.
MINITAUR_REACTIVE_CONFIG_ADDITIONS = dict(
    env=functools.partial(
        minitaur_reactive_env.MinitaurReactiveEnv,
        urdf_version=minitaur_gym_env.RAINBOW_DASH_V0_URDF_VERSION,
        energy_weight=0.005,
        accurate_motor_model_enabled=True,
        pd_latency=0.003,
        control_latency=0.02,
        motor_kd=0.015,
        remove_default_joint_damping=True,
        env_randomizer=None,
        render=False,
        num_steps_to_log=MAX_LENGTH))
MINITAUR_REACTIVE_CONFIG = merge_two_dicts(DEFAULT_CONFIG,
                                           MINITAUR_REACTIVE_CONFIG_ADDITIONS)

# Configuration specific to MinitaurReactiveEnv class with randomizer.
MINITAUR_REACTIVE_RANDOMIZER_CONFIG_ADDITIONS = dict(
    env=functools.partial(
        minitaur_reactive_env.MinitaurReactiveEnv,
        urdf_version=minitaur_gym_env.RAINBOW_DASH_V0_URDF_VERSION,
        energy_weight=0.005,
        accurate_motor_model_enabled=True,
        pd_latency=0.003,
        control_latency=0.02,
        motor_kd=0.015,
        remove_default_joint_damping=True,
        env_randomizer=randomizer_config_lib.MinitaurEnvRandomizerFromConfig(),
        render=False,
        num_steps_to_log=MAX_LENGTH))
MINITAUR_REACTIVE_RANDOMIZER_CONFIG = merge_two_dicts(
    DEFAULT_CONFIG, MINITAUR_REACTIVE_RANDOMIZER_CONFIG_ADDITIONS)
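The configs above compose by shallow merge, and a config is then looked up by name, which is exactly what train_ars.py does with getattr. A quick illustration (instantiating the env requires the pybullet Minitaur packages to be importable):

import config_ars

config = getattr(config_ars, 'MINITAUR_GYM_CONFIG')
print(config['num_directions'], config['step_size'])  # 8 0.02 (merged values)
env = config['env']()  # the functools.partial instantiates MinitaurGymEnv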
99 examples/pybullet/gym/pybullet_envs/ARS/eval_ars.py Normal file
@@ -0,0 +1,99 @@
"""
blaze run -c opt //experimental/users/jietan/ARS:eval_ars -- \
--logdir=/cns/ij-d/home/jietan/experiment/ARS/ars_react_nr01.191950338.191950550/ \
--checkpoint=lin_policy_plus_990.npz \
--num_rollouts=10
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, inspect
import time

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
os.sys.path.insert(0, currentdir)

from absl import app
from absl import flags

import pdb
import os
import numpy as np
import gym
import config_ars
import utility
import policies

FLAGS = flags.FLAGS

flags.DEFINE_string('logdir', None, 'The path of the checkpoint.')
flags.DEFINE_string('checkpoint', None, 'The file name of the checkpoint.')
flags.DEFINE_integer('num_rollouts', 1, 'The number of rollouts.')


def main(argv):
  del argv  # Unused.

  print('loading and building expert policy')
  checkpoint_file = os.path.join(FLAGS.logdir, FLAGS.checkpoint)
  lin_policy = np.load(checkpoint_file, encoding='bytes')
  lin_policy = list(lin_policy.items())[0][1]  # list() so this also works on Python 3

  M = lin_policy[0]
  # mean and std of state vectors estimated online by ARS.
  mean = lin_policy[1]
  std = lin_policy[2]

  config = utility.load_config(FLAGS.logdir)
  print("config=", config)
  env = config['env'](hard_reset=True, render=True)
  ob_dim = env.observation_space.shape[0]
  ac_dim = env.action_space.shape[0]

  # set policy parameters. Possible filters: 'MeanStdFilter' for v2, 'NoFilter' for v1.
  policy_params = {
      'type': 'linear',
      'ob_filter': config['filter'],
      'ob_dim': ob_dim,
      'ac_dim': ac_dim,
      "weights": M,
      "mean": mean,
      "std": std,
  }
  policy = policies.LinearPolicy(policy_params, update_filter=False)
  returns = []
  observations = []
  actions = []
  for i in range(FLAGS.num_rollouts):
    print('iter', i)
    obs = env.reset()
    done = False
    totalr = 0.
    steps = 0
    while not done:
      action = policy.act(obs)
      observations.append(obs)
      actions.append(action)

      obs, r, done, _ = env.step(action)
      time.sleep(1. / 100.)
      totalr += r
      steps += 1
      if steps % 100 == 0:
        print('%i/%i' % (steps, config['rollout_length']))
      if steps >= config['rollout_length']:
        break
    returns.append(totalr)

  print('returns', returns)
  print('mean return', np.mean(returns))
  print('std of return', np.std(returns))


if __name__ == '__main__':
  flags.mark_flag_as_required('logdir')
  flags.mark_flag_as_required('checkpoint')
  app.run(main)
280 examples/pybullet/gym/pybullet_envs/ARS/filter.py Normal file
@@ -0,0 +1,280 @@
# Code in this file is copied and adapted from
# https://github.com/ray-project/ray/blob/master/python/ray/rllib/utils/filter.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np


class Filter(object):
  """Processes input, possibly statefully."""

  def update(self, other, *args, **kwargs):
    """Updates self with "new state" from other filter."""
    raise NotImplementedError

  def copy(self):
    """Creates a new object with same state as self.

    Returns:
      copy (Filter): Copy of self
    """
    raise NotImplementedError

  def sync(self, other):
    """Copies all state from other filter to self."""
    raise NotImplementedError


class NoFilter(Filter):

  def __init__(self, *args):
    pass

  def __call__(self, x, update=True):
    return np.asarray(x, dtype=np.float64)

  def update(self, other, *args, **kwargs):
    pass

  def copy(self):
    return self

  def sync(self, other):
    pass

  def stats_increment(self):
    pass

  def clear_buffer(self):
    pass

  def get_stats(self):
    return 0, 1

  @property
  def mean(self):
    return 0

  @property
  def var(self):
    return 1

  @property
  def std(self):
    return 1


# http://www.johndcook.com/blog/standard_deviation/
class RunningStat(object):

  def __init__(self, shape=None):
    self._n = 0
    self._M = np.zeros(shape, dtype=np.float64)
    self._S = np.zeros(shape, dtype=np.float64)

  def copy(self):
    other = RunningStat()
    other._n = self._n
    other._M = np.copy(self._M)
    other._S = np.copy(self._S)
    return other

  def push(self, x):
    x = np.asarray(x)
    # Unvectorized update of the running statistics.
    assert x.shape == self._M.shape, ("x.shape = {}, self.shape = {}"
                                      .format(x.shape, self._M.shape))
    n1 = self._n
    self._n += 1
    if self._n == 1:
      self._M[...] = x
    else:
      delta = x - self._M
      self._M[...] += delta / self._n
      self._S[...] += delta * delta * n1 / self._n

  def update(self, other):
    n1 = self._n
    n2 = other._n
    n = n1 + n2
    delta = self._M - other._M
    delta2 = delta * delta
    M = (n1 * self._M + n2 * other._M) / n
    S = self._S + other._S + delta2 * n1 * n2 / n
    self._n = n
    self._M = M
    self._S = S

  def __repr__(self):
    return '(n={}, mean_mean={}, mean_std={})'.format(
        self.n, np.mean(self.mean), np.mean(self.std))

  @property
  def n(self):
    return self._n

  @property
  def mean(self):
    return self._M

  @property
  def var(self):
    return self._S / (self._n - 1) if self._n > 1 else np.square(self._M)

  @property
  def std(self):
    return np.sqrt(self.var)

  @property
  def shape(self):
    return self._M.shape


class MeanStdFilter(Filter):
  """Keeps track of a running mean for seen states."""

  def __init__(self, shape, demean=True, destd=True):
    self.shape = shape
    self.demean = demean
    self.destd = destd
    self.rs = RunningStat(shape)
    # In distributed rollouts, each worker sees different states.
    # The buffer is used to keep track of deltas amongst all the
    # observation filters.
    self.buffer = RunningStat(shape)

    self.mean = np.zeros(shape, dtype=np.float64)
    self.std = np.ones(shape, dtype=np.float64)

  def clear_buffer(self):
    self.buffer = RunningStat(self.shape)
    return

  def update(self, other, copy_buffer=False):
    """Takes another filter and only applies the information from the buffer.

    Using notation `F(state, buffer)`,
    given `Filter1(x1, y1)` and `Filter2(x2, yt)`,
    `update` modifies `Filter1` to `Filter1(x1 + yt, y1)`.
    If `copy_buffer`, then `Filter1` is modified to `Filter1(x1 + yt, yt)`.
    """
    self.rs.update(other.buffer)
    if copy_buffer:
      self.buffer = other.buffer.copy()
    return

  def copy(self):
    """Returns a copy of Filter."""
    other = MeanStdFilter(self.shape)
    other.demean = self.demean
    other.destd = self.destd
    other.rs = self.rs.copy()
    other.buffer = self.buffer.copy()
    return other

  def sync(self, other):
    """Syncs all fields together from other filter.

    Using notation `F(state, buffer)`,
    given `Filter1(x1, y1)` and `Filter2(x2, yt)`,
    `sync` modifies `Filter1` to `Filter1(x2, yt)`.
    """
    assert other.shape == self.shape, "Shapes don't match!"
    self.demean = other.demean
    self.destd = other.destd
    self.rs = other.rs.copy()
    self.buffer = other.buffer.copy()
    return

  def __call__(self, x, update=True):
    x = np.asarray(x, dtype=np.float64)
    if update:
      if len(x.shape) == len(self.rs.shape) + 1:
        # The vectorized case.
        for i in range(x.shape[0]):
          self.rs.push(x[i])
          self.buffer.push(x[i])
      else:
        # The unvectorized case.
        self.rs.push(x)
        self.buffer.push(x)
    if self.demean:
      x = x - self.mean
    if self.destd:
      x = x / (self.std + 1e-8)
    return x

  def stats_increment(self):
    self.mean = self.rs.mean
    self.std = self.rs.std

    # Set values for std less than 1e-7 to +inf to avoid
    # dividing by zero. State elements with zero variance
    # are set to zero as a result.
    self.std[self.std < 1e-7] = float("inf")
    return

  def get_stats(self):
    return self.rs.mean, (self.rs.std + 1e-8)

  def __repr__(self):
    return 'MeanStdFilter({}, {}, {}, {}, {})'.format(
        self.shape, self.demean, self.destd, self.rs, self.buffer)


def get_filter(filter_config, shape=None):
  if filter_config == "MeanStdFilter":
    return MeanStdFilter(shape)
  elif filter_config == "NoFilter":
    return NoFilter()
  else:
    raise Exception("Unknown observation_filter: " + str(filter_config))


def test_running_stat():
  for shp in ((), (3,), (3, 4)):
    li = []
    rs = RunningStat(shp)
    for _ in range(5):
      val = np.random.randn(*shp)
      rs.push(val)
      li.append(val)
    m = np.mean(li, axis=0)
    assert np.allclose(rs.mean, m)
    v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0)
    assert np.allclose(rs.var, v)


def test_combining_stat():
  for shape in [(), (3,), (3, 4)]:
    li = []
    rs1 = RunningStat(shape)
    rs2 = RunningStat(shape)
    rs = RunningStat(shape)
    for _ in range(5):
      val = np.random.randn(*shape)
      rs1.push(val)
      rs.push(val)
      li.append(val)
    for _ in range(9):
      rs2.push(val)
      rs.push(val)
      li.append(val)
    rs1.update(rs2)
    assert np.allclose(rs.mean, rs1.mean)
    assert np.allclose(rs.std, rs1.std)


if __name__ == '__main__':
  # Run the self-tests only when executed directly, not on import.
  test_running_stat()
  test_combining_stat()
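A small usage sketch of the filter above (note that `filter` here is this local module, which shadows the Python builtin): observations are pushed through __call__, and stats_increment() publishes the accumulated mean/std used for normalization:

import numpy as np
import filter  # the local module above, not the Python builtin

f = filter.get_filter('MeanStdFilter', shape=(3,))
for _ in range(1000):
  f(np.random.randn(3) * 5.0 + 2.0)  # accumulate running statistics
f.stats_increment()  # publish mean/std for subsequent normalization
x = f(np.full(3, 2.0), update=False)
print(x)  # approximately zero-mean, unit-scale output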
29 examples/pybullet/gym/pybullet_envs/ARS/log/config.yaml Normal file
@@ -0,0 +1,29 @@
delta_std: 0.03
deltas_used: 8
env: !!python/object/apply:functools.partial
  args:
  - &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_reactive_env.MinitaurReactiveEnv ''
  state: !!python/tuple
  - *id001
  - !!python/tuple []
  - accurate_motor_model_enabled: true
    control_latency: 0.02
    energy_weight: 0.005
    env_randomizer: null
    motor_kd: 0.015
    num_steps_to_log: 1000
    pd_latency: 0.003
    remove_default_joint_damping: true
    render: false
    urdf_version: rainbow_dash_v0
  - null
filter: MeanStdFilter
num_directions: 8
num_iterations: 1000
num_workers: 8
policy_type: linear
rollout_length: 1000
seed: 237
shift: 0
step_size: 0.02
Binary file not shown.
104 examples/pybullet/gym/pybullet_envs/ARS/logz.py Normal file
@@ -0,0 +1,104 @@
# Code in this file is copied and adapted from
# https://github.com/berkeleydeeprlcourse

"""
Some simple logging functionality, inspired by rllab's logging.
Assumes that each diagnostic gets logged each iteration.

Call logz.configure_output_dir() to start logging to a
tab-separated-values file (some_folder_name/log.txt)
"""

import json
import os.path as osp, shutil, time, atexit, os, subprocess

color2num = dict(
    gray=30,
    red=31,
    green=32,
    yellow=33,
    blue=34,
    magenta=35,
    cyan=36,
    white=37,
    crimson=38
)


def colorize(string, color, bold=False, highlight=False):
  attr = []
  num = color2num[color]
  if highlight: num += 10
  attr.append(str(num))
  if bold: attr.append('1')
  return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)


class G(object):
  output_dir = None
  output_file = None
  first_row = True
  log_headers = []
  log_current_row = {}


def configure_output_dir(d=None):
  """Set output directory to d, or to /tmp/somerandomnumber if d is None."""
  G.first_row = True
  G.log_headers = []
  G.log_current_row = {}

  G.output_dir = d or "/tmp/experiments/%i" % int(time.time())
  if not osp.exists(G.output_dir):
    os.makedirs(G.output_dir)
  G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w')
  atexit.register(G.output_file.close)
  print(colorize("Logging data to %s" % G.output_file.name, 'green', bold=True))


def log_tabular(key, val):
  """Log a value of some diagnostic.

  Call this once for each diagnostic quantity, each iteration.
  """
  if G.first_row:
    G.log_headers.append(key)
  else:
    assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration" % key
  assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()" % key
  G.log_current_row[key] = val


def save_params(params):
  with open(osp.join(G.output_dir, "params.json"), 'w') as out:
    out.write(json.dumps(params, separators=(',\n', '\t:\t'), sort_keys=True))


def dump_tabular():
  """Write all of the diagnostics from the current iteration."""
  vals = []
  key_lens = [len(key) for key in G.log_headers]
  max_key_len = max(15, max(key_lens))
  keystr = '%' + '%d' % max_key_len
  fmt = "| " + keystr + "s | %15s |"
  n_slashes = 22 + max_key_len
  print("-" * n_slashes)
  for key in G.log_headers:
    val = G.log_current_row.get(key, "")
    if hasattr(val, "__float__"): valstr = "%8.3g" % val
    else: valstr = val
    print(fmt % (key, valstr))
    vals.append(val)
  print("-" * n_slashes)
  if G.output_file is not None:
    if G.first_row:
      G.output_file.write("\t".join(G.log_headers))
      G.output_file.write("\n")
    G.output_file.write("\t".join(map(str, vals)))
    G.output_file.write("\n")
    G.output_file.flush()
  G.log_current_row.clear()
  G.first_row = False
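Usage sketch (the output directory here is arbitrary): every key must be logged on the first iteration, after which each dump_tabular() prints a table and appends one tab-separated row to log.txt:

import logz

logz.configure_output_dir('/tmp/logz_demo')
for it in range(3):
  logz.log_tabular('Iteration', it)
  logz.log_tabular('AverageReward', 1.5 * it)
  logz.dump_tabular()  # prints a table, appends a TSV row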
35 examples/pybullet/gym/pybullet_envs/ARS/optimizers.py Normal file
@@ -0,0 +1,35 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np


# OPTIMIZERS FOR MINIMIZING OBJECTIVES
class Optimizer(object):

  def __init__(self, w_policy):
    self.w_policy = w_policy.flatten()
    self.dim = w_policy.size
    self.t = 0

  def update(self, globalg):
    self.t += 1
    step = self._compute_step(globalg)
    ratio = np.linalg.norm(step) / (np.linalg.norm(self.w_policy) + 1e-5)
    return self.w_policy + step, ratio

  def _compute_step(self, globalg):
    raise NotImplementedError


class SGD(Optimizer):

  def __init__(self, pi, stepsize):
    Optimizer.__init__(self, pi)
    self.stepsize = stepsize

  def _compute_step(self, globalg):
    step = -self.stepsize * globalg
    return step
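A short sketch of how ARSLearner uses SGD above. Note the double negative: _compute_step returns -stepsize * g_hat, and train_step in ars.py then subtracts it, so the net update ascends the reward gradient:

import numpy as np
import optimizers

w = np.zeros(6)  # flattened policy weights
opt = optimizers.SGD(w, 0.02)
g_hat = np.ones(6)  # stand-in for the ARS gradient estimate
w -= opt._compute_step(g_hat).reshape(w.shape)  # net effect: w += 0.02 * g_hat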
72 examples/pybullet/gym/pybullet_envs/ARS/policies.py Normal file
@@ -0,0 +1,72 @@
"""Policy class for computing action from weights and observation vector.

Horia Mania --- hmania@berkeley.edu
Aurelia Guy
Benjamin Recht
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import filter


class Policy(object):

  def __init__(self, policy_params):

    self.ob_dim = policy_params['ob_dim']
    self.ac_dim = policy_params['ac_dim']
    self.weights = np.empty(0)

    # a filter for updating statistics of the observations and normalizing
    # inputs to the policies
    self.observation_filter = filter.get_filter(
        policy_params['ob_filter'], shape=(self.ob_dim,))
    self.update_filter = True

  def update_weights(self, new_weights):
    self.weights[:] = new_weights[:]
    return

  def get_weights(self):
    return self.weights

  def get_observation_filter(self):
    return self.observation_filter

  def act(self, ob):
    raise NotImplementedError

  def copy(self):
    raise NotImplementedError


class LinearPolicy(Policy):
  """Linear policy class that computes action as <w, ob>."""

  def __init__(self, policy_params, update_filter=True):
    Policy.__init__(self, policy_params)
    self.weights = np.zeros(self.ac_dim * self.ob_dim, dtype=np.float64)
    if "weights" in policy_params:
      self.weights = policy_params["weights"]
    if "mean" in policy_params:
      self.observation_filter.mean = policy_params["mean"]
    if "std" in policy_params:
      self.observation_filter.std = policy_params["std"]
    self.update_filter = update_filter

  def act(self, ob):
    ob = self.observation_filter(ob, update=self.update_filter)
    matrix_weights = np.reshape(self.weights, (self.ac_dim, self.ob_dim))
    return np.clip(np.dot(matrix_weights, ob), -1.0, 1.0)

  def get_weights_plus_stats(self):
    mu, std = self.observation_filter.get_stats()
    aux = np.asarray([self.weights, mu, std])
    return aux
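Standalone usage sketch of the linear policy (the dimensions here are arbitrary, not the Minitaur ones):

import numpy as np
import policies

policy = policies.LinearPolicy({
    'type': 'linear',
    'ob_filter': 'MeanStdFilter',
    'ob_dim': 4,
    'ac_dim': 2,
})
action = policy.act(np.random.randn(4))  # <W, ob>, clipped to [-1, 1]
print(action.shape)  # (2,)
mu, std = policy.observation_filter.get_stats()  # stats saved with checkpoints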
41 examples/pybullet/gym/pybullet_envs/ARS/shared_noise.py Normal file
@@ -0,0 +1,41 @@
"""Shared noise table used by the ARS workers.

Code in this file is copied and adapted from
https://github.com/ray-project/ray/tree/master/python/ray/rllib/es
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np


def create_shared_noise():
  """Create a large array of noise to be shared by all workers.

  Used for avoiding the communication of the random perturbations delta.
  """
  seed = 12345
  count = 250000000
  noise = np.random.RandomState(seed).randn(count).astype(np.float64)
  return noise


class SharedNoiseTable(object):

  def __init__(self, noise, seed=11):
    self.rg = np.random.RandomState(seed)
    self.noise = noise
    assert self.noise.dtype == np.float64

  def get(self, i, dim):
    return self.noise[i:i + dim]

  def sample_index(self, dim):
    return self.rg.randint(0, len(self.noise) - dim + 1)

  def get_delta(self, dim):
    idx = self.sample_index(dim)
    return idx, self.get(idx, dim)
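The point of the table: workers only ever communicate an index, and anyone holding the same noise array can reconstruct the perturbation. A sketch with a small stand-in table (create_shared_noise() itself allocates roughly 2 GB of float64 noise):

import numpy as np
import shared_noise

noise = np.random.RandomState(0).randn(10**6)  # small stand-in table
table = shared_noise.SharedNoiseTable(noise, seed=42)
idx, delta = table.get_delta(10)  # an index plus a 10-dim perturbation
assert np.array_equal(delta, table.get(idx, 10))  # recoverable from idx alone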
28 examples/pybullet/gym/pybullet_envs/ARS/start_ars_servers.py Normal file
@@ -0,0 +1,28 @@
"""Starts a pool of local ARS evaluation servers.

blaze build -c opt //experimental/users/jietan/ARS:start_ars_servers
blaze-bin/experimental/users/jietan/ARS/start_ars_servers
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import subprocess
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_integer("num_servers", 8, "number of servers")


def main(argv):
  del argv  # Unused.
  for server_id in range(FLAGS.num_servers):  # range, not py2-only xrange
    args = [
        "blaze-bin/experimental/users/jietan/ARS/ars_server",
        "--config_name=MINITAUR_GYM_CONFIG",
        "--server_id={}".format(server_id), "--run_on_borg=False"
    ]
    subprocess.Popen(args)


if __name__ == '__main__':
  app.run(main)
93 examples/pybullet/gym/pybullet_envs/ARS/train_ars.borg Normal file
@@ -0,0 +1,93 @@
// Example borg file to do a parameter sweep.
//
// To run:
// echo `srcfs get_readonly`-`g4 p | head -1 | awk '{print $2}'`
// blaze build -c opt experimental/users/jietan/ARS:ars_server.par
// blaze build -c opt experimental/users/jietan/ARS:ars_client.par
// borgcfg --skip_confirmation --vars 'base_cl=191950338,my_cl=191950550,label=ars_react_nr01,config=MINITAUR_REACTIVE_CONFIG' experimental/users/jietan/ARS/train_ars.borg reload
// borgcfg --skip_confirmation --vars 'base_cl=191950338,my_cl=191950550,label=ars_react_rd01,config=MINITAUR_REACTIVE_RANDOMIZER_CONFIG' experimental/users/jietan/ARS/train_ars.borg reload


import '//production/borg/templates/lambda/buildtool_support.borg' as build
import '//production/borg/templates/lambda/dnsname.borg' as dns

vars = {
  cell = 'atlanta'
  charged_user = 'robotics'
  base_cl = 0
  my_cl = 0
  label = external
  user = real_username()
  workers = 8
  config = external
  cns_home = "/cns/ij-d/home/%user%"
  logdir = "%cns_home%/experiment/ARS/%label%.%base_cl%.%my_cl%/"
}

service augmented_random_search {
  runtime {
    cell = vars.cell
  }

  scheduling = {
    priority = 100
    batch_quota = {
      strategy = 'RUN_SOON'
    }
    deadline = 3600 * 24
  }
  accounting = {
    charged_user = vars.charged_user
  }
  requirements {
    autopilot = true
  }
  params = {
    mygoogle3 = build.google3dir(myfilename())
    experiment_dir = 'experimental/users/jietan/ARS/'
  }

  job ars_server = {
    runtime {
      cell = vars.cell
    }
    name = real_username() + '_server_' + vars.label
    replicas = vars.workers
    binary_path = build.binfile_v2(params.mygoogle3,
                                   params.experiment_dir + 'ars_server')
    runfiles = binary_path + '.runfiles/google3/'
    packages = {
      package third_party = {
        directory = runfiles + 'third_party/'
      }
    }
    binary = build.binfile(params.mygoogle3,
                           params.experiment_dir + 'ars_server.par')
    args = {
      server_id = '%task%'
      config_name = vars.config
      port = '%port%'
      run_on_borg = true
    }
  }
  job ars_client = {
    name = real_username() + '_client_' + vars.label
    binary_path = build.binfile_v2(params.mygoogle3,
                                   params.experiment_dir + 'ars_client')
    runfiles = binary_path + '.runfiles/google3/'
    packages = {
      package third_party = {
        directory = runfiles + 'third_party/'
      }
    }
    binary = build.binfile(params.mygoogle3,
                           params.experiment_dir + 'ars_client.par')
    args = {
      server_address = dns.borg_dns_name(ars_server)
      num_servers = vars.workers
      config_name = vars.config
      logdir = vars.logdir
      run_on_borg = true
    }
  }
}
64 examples/pybullet/gym/pybullet_envs/ARS/train_ars.py Normal file
@@ -0,0 +1,64 @@
"""Trains a Minitaur locomotion policy with Augmented Random Search.

blaze build -c opt //experimental/users/jietan/ARS:train_ars
blaze-bin/experimental/users/jietan/ARS/train_ars \
--logdir=/cns/ij-d/home/jietan/experiment/ARS/test1 \
--config_name=MINITAUR_GYM_CONFIG
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from absl import app
from absl import flags
import ars
import config_ars

FLAGS = flags.FLAGS
flags.DEFINE_string('logdir', None, 'The directory to write the log file.')
flags.DEFINE_string('config_name', None, 'The name of the config dictionary')


def run_ars(config, logdir):

  env = config["env"]()
  ob_dim = env.observation_space.shape[0]
  ac_dim = env.action_space.shape[0]

  # set policy parameters. Possible filters: 'MeanStdFilter' for v2, 'NoFilter' for v1.
  policy_params = {
      'type': 'linear',
      'ob_filter': config['filter'],
      'ob_dim': ob_dim,
      'ac_dim': ac_dim
  }

  ARS = ars.ARSLearner(
      env_callback=config['env'],
      policy_params=policy_params,
      num_deltas=config['num_directions'],
      deltas_used=config['deltas_used'],
      step_size=config['step_size'],
      delta_std=config['delta_std'],
      logdir=logdir,
      rollout_length=config['rollout_length'],
      shift=config['shift'],
      params=config,
      seed=config['seed'])

  return ARS.train(config['num_iterations'])


def main(argv):
  del argv  # Unused.
  config = getattr(config_ars, FLAGS.config_name)
  run_ars(config=config, logdir=FLAGS.logdir)


if __name__ == '__main__':
  flags.mark_flag_as_required('logdir')
  flags.mark_flag_as_required('config_name')
  app.run(main)
29 examples/pybullet/gym/pybullet_envs/ARS/train_ars_test.py Normal file
@@ -0,0 +1,29 @@
"""Tests for google3.experimental.users.jietan.ARS.train_ars.

blaze build -c opt //experimental/users/jietan/ARS:train_ars_test
blaze-bin/experimental/users/jietan/ARS/train_ars_test
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import flags
from google3.testing.pybase import googletest
from google3.experimental.users.jietan.ARS import train_ars
from google3.experimental.users.jietan.ARS import config_ars

FLAGS = flags.FLAGS
MAX_RETURN_AFTER_TWO_ITERATIONS = 0.0890905394617


class TrainArsTest(googletest.TestCase):

  def testArsTwoStepResult(self):
    config = getattr(config_ars, "MINITAUR_REACTIVE_CONFIG")
    config['num_iterations'] = 2
    info = train_ars.run_ars(config=config, logdir=FLAGS.test_tmpdir)
    print(info)
    self.assertAlmostEqual(info["max_reward"], MAX_RETURN_AFTER_TWO_ITERATIONS)


if __name__ == '__main__':
  googletest.main()
52 examples/pybullet/gym/pybullet_envs/ARS/utility.py Normal file
@@ -0,0 +1,52 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import ruamel.yaml as yaml


def save_config(config, logdir):
  """Save a new configuration by name.

  If a logging directory is specified, it will be created and the configuration
  will be stored there. Otherwise, a log message will be printed.

  Args:
    config: Configuration object.
    logdir: Location for writing summaries and checkpoints if specified.

  Returns:
    Configuration object.
  """
  message = 'Start a new run and write summaries and checkpoints to {}.'
  print(message.format(logdir))
  config_path = os.path.join(logdir, 'config.yaml')
  # yaml.dump expects a stream, not a path string.
  with open(config_path, 'w') as config_file:
    yaml.dump(config, config_file, default_flow_style=False)
  return config


def load_config(logdir):
  """Load a configuration from the log directory.

  Args:
    logdir: The logging directory containing the configuration file.

  Raises:
    IOError: The logging directory does not contain a configuration file.

  Returns:
    Configuration object.
  """
  config_path = logdir and os.path.join(logdir, 'config.yaml')
  if not config_path:
    message = (
        'Cannot resume an existing run since the logging directory does not '
        'contain a configuration file.')
    raise IOError(message)
  print("config_path=", config_path)

  stream = open(config_path, 'r')
  config = yaml.load(stream)
  message = 'Resume run and write summaries and checkpoints to {}.'
  print(message.format(logdir))
  return config
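Round-trip sketch for the two helpers above. Note that save_config expects the directory to already exist; in ARSLearner, logz.configure_output_dir creates it first:

import os
import utility

logdir = '/tmp/ars_config_demo'  # arbitrary demo path
if not os.path.exists(logdir):
  os.makedirs(logdir)  # save_config does not create the directory itself
utility.save_config({'seed': 237, 'step_size': 0.02}, logdir)
restored = utility.load_config(logdir)
assert restored['seed'] == 237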
28 examples/pybullet/gym/pybullet_envs/ARS/utils.py Normal file
@@ -0,0 +1,28 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.

import numpy as np


def itergroups(items, group_size):
  assert group_size >= 1
  group = []
  for x in items:
    group.append(x)
    if len(group) == group_size:
      yield tuple(group)
      del group[:]
  if group:
    yield tuple(group)


def batched_weighted_sum(weights, vecs, batch_size):
  total = 0
  num_items_summed = 0
  for batch_weights, batch_vecs in zip(itergroups(weights, batch_size),
                                       itergroups(vecs, batch_size)):
    assert len(batch_weights) == len(batch_vecs) <= batch_size
    total += np.dot(np.asarray(batch_weights, dtype=np.float64),
                    np.asarray(batch_vecs, dtype=np.float64))
    num_items_summed += len(batch_weights)
  return total, num_items_summed
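batched_weighted_sum computes the weighted sum of vectors in bounded-memory batches; in ars.py the weights are reward differences and the vectors are noise deltas. A tiny check:

import numpy as np
import utils

weights = np.array([1.0, -1.0, 2.0])
vecs = [np.ones(4), np.full(4, 2.0), np.full(4, 0.5)]
total, n = utils.batched_weighted_sum(weights, vecs, batch_size=2)
print(total)  # [0. 0. 0. 0.]: 1*1 - 1*2 + 2*0.5 per component
print(n)      # 3 vectors summed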