diff --git a/examples/pybullet/gym/pybullet_envs/ARS/ars.py b/examples/pybullet/gym/pybullet_envs/ARS/ars.py new file mode 100644 index 000000000..cf59ede85 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/ars.py @@ -0,0 +1,397 @@ +"""Internal implementation of the Augmented Random Search method.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os, inspect +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +os.sys.path.insert(0,currentdir) + +from concurrent import futures +import copy +import os +import time +import gym +import numpy as np +import logz +import utils +import optimizers +#from google3.pyglib import gfile +import policies +import shared_noise +import utility + +class Worker(object): + """Object class for parallel rollout generation.""" + + def __init__(self, + env_seed, + env_callback, + policy_params=None, + deltas=None, + rollout_length=1000, + delta_std=0.02): + + # initialize OpenAI environment for each worker + self.env = env_callback() + self.env.seed(env_seed) + + # each worker gets access to the shared noise table + # with independent random streams for sampling + # from the shared noise table. + self.deltas = shared_noise.SharedNoiseTable(deltas, env_seed + 7) + self.policy_params = policy_params + if policy_params['type'] == 'linear': + self.policy = policies.LinearPolicy(policy_params) + else: + raise NotImplementedError + + self.delta_std = delta_std + self.rollout_length = rollout_length + + def get_weights_plus_stats(self): + """ + Get current policy weights and current statistics of past states. + """ + assert self.policy_params['type'] == 'linear' + return self.policy.get_weights_plus_stats() + + def rollout(self, shift=0., rollout_length=None): + """Performs one rollout of maximum length rollout_length. + + At each time-step it substracts shift from the reward. + """ + + if rollout_length is None: + rollout_length = self.rollout_length + + total_reward = 0. + steps = 0 + + ob = self.env.reset() + for i in range(rollout_length): + action = self.policy.act(ob) + ob, reward, done, _ = self.env.step(action) + steps += 1 + total_reward += (reward - shift) + if done: + break + + return total_reward, steps + + def do_rollouts(self, w_policy, num_rollouts=1, shift=1, evaluate=False): + """ + Generate multiple rollouts with a policy parametrized by w_policy. 
+ """ + print('Doing {} rollouts'.format(num_rollouts)) + rollout_rewards, deltas_idx = [], [] + steps = 0 + + for i in range(num_rollouts): + + if evaluate: + self.policy.update_weights(w_policy) + deltas_idx.append(-1) + + # set to false so that evaluation rollouts are not used for updating state statistics + self.policy.update_filter = False + + # for evaluation we do not shift the rewards (shift = 0) and we use the + # default rollout length (1000 for the MuJoCo locomotion tasks) + reward, r_steps = self.rollout( + shift=0., rollout_length=self.rollout_length) + rollout_rewards.append(reward) + + else: + idx, delta = self.deltas.get_delta(w_policy.size) + + delta = (self.delta_std * delta).reshape(w_policy.shape) + deltas_idx.append(idx) + + # set to true so that state statistics are updated + self.policy.update_filter = True + + # compute reward and number of timesteps used for positive perturbation rollout + self.policy.update_weights(w_policy + delta) + pos_reward, pos_steps = self.rollout(shift=shift) + + # compute reward and number of timesteps used for negative pertubation rollout + self.policy.update_weights(w_policy - delta) + neg_reward, neg_steps = self.rollout(shift=shift) + steps += pos_steps + neg_steps + + rollout_rewards.append([pos_reward, neg_reward]) + + return { + 'deltas_idx': deltas_idx, + 'rollout_rewards': rollout_rewards, + 'steps': steps + } + + def stats_increment(self): + self.policy.observation_filter.stats_increment() + return + + def get_weights(self): + return self.policy.get_weights() + + def get_filter(self): + return self.policy.observation_filter + + def sync_filter(self, other): + self.policy.observation_filter.sync(other) + return + + +class ARSLearner(object): + """ + Object class implementing the ARS algorithm. 
+ """ + + def __init__(self, + env_callback, + policy_params=None, + num_workers=32, + num_deltas=320, + deltas_used=320, + delta_std=0.02, + logdir=None, + rollout_length=1000, + step_size=0.01, + shift='constant zero', + params=None, + seed=123): + + logz.configure_output_dir(logdir) + # params_to_save = copy.deepcopy(params) + # params_to_save['env'] = None + # logz.save_params(params_to_save) + utility.save_config(params, logdir) + env = env_callback() + + self.timesteps = 0 + self.action_size = env.action_space.shape[0] + self.ob_size = env.observation_space.shape[0] + self.num_deltas = num_deltas + self.deltas_used = deltas_used + self.rollout_length = rollout_length + self.step_size = step_size + self.delta_std = delta_std + self.logdir = logdir + self.shift = shift + self.params = params + self.max_past_avg_reward = float('-inf') + self.num_episodes_used = float('inf') + + # create shared table for storing noise + print('Creating deltas table.') + deltas = shared_noise.create_shared_noise() + self.deltas = shared_noise.SharedNoiseTable(deltas, seed=seed + 3) + print('Created deltas table.') + + # initialize workers with different random seeds + print('Initializing workers.') + self.num_workers = num_workers + self.workers = [ + Worker( + seed + 7 * i, + env_callback=env_callback, + policy_params=policy_params, + deltas=deltas, + rollout_length=rollout_length, + delta_std=delta_std) for i in range(num_workers) + ] + + # initialize policy + if policy_params['type'] == 'linear': + self.policy = policies.LinearPolicy(policy_params) + self.w_policy = self.policy.get_weights() + else: + raise NotImplementedError + + # initialize optimization algorithm + self.optimizer = optimizers.SGD(self.w_policy, self.step_size) + print('Initialization of ARS complete.') + + def aggregate_rollouts(self, num_rollouts=None, evaluate=False): + """ + Aggregate update step from rollouts generated in parallel. 
+ """ + + if num_rollouts is None: + num_deltas = self.num_deltas + else: + num_deltas = num_rollouts + + results_one = [] #rollout_ids_one + results_two = [] #rollout_ids_two + + t1 = time.time() + num_rollouts = int(num_deltas / self.num_workers) +# if num_rollouts > 0: +# with futures.ThreadPoolExecutor( +# max_workers=self.num_workers) as executor: +# workers = [ +# executor.submit( +# worker.do_rollouts, +# self.w_policy, +# num_rollouts=num_rollouts, +# shift=self.shift, +# evaluate=evaluate) for worker in self.workers +# ] +# for worker in futures.as_completed(workers): +# results_one.append(worker.result()) +# +# workers = [ +# executor.submit( +# worker.do_rollouts, +# self.w_policy, +# num_rollouts=1, +# shift=self.shift, +# evaluate=evaluate) +# for worker in self.workers[:(num_deltas % self.num_workers)] +# ] +# for worker in futures.as_completed(workers): +# results_two.append(worker.result()) + + # parallel generation of rollouts + rollout_ids_one = [ + worker.do_rollouts( + self.w_policy, + num_rollouts=num_rollouts, + shift=self.shift, + evaluate=evaluate) for worker in self.workers + ] + + rollout_ids_two = [ + worker.do_rollouts( + self.w_policy, num_rollouts=1, shift=self.shift, evaluate=evaluate) + for worker in self.workers[:(num_deltas % self.num_workers)] + ] + results_one = rollout_ids_one + results_two = rollout_ids_two +# gather results + + rollout_rewards, deltas_idx = [], [] + + for result in results_one: + if not evaluate: + self.timesteps += result['steps'] + deltas_idx += result['deltas_idx'] + rollout_rewards += result['rollout_rewards'] + + for result in results_two: + if not evaluate: + self.timesteps += result['steps'] + deltas_idx += result['deltas_idx'] + rollout_rewards += result['rollout_rewards'] + + deltas_idx = np.array(deltas_idx) + rollout_rewards = np.array(rollout_rewards, dtype=np.float64) + + print('Maximum reward of collected rollouts:', rollout_rewards.max()) + info_dict = { + "max_reward": rollout_rewards.max() + } + t2 = time.time() + + print('Time to generate rollouts:', t2 - t1) + + if evaluate: + return rollout_rewards + + # select top performing directions if deltas_used < num_deltas + max_rewards = np.max(rollout_rewards, axis=1) + if self.deltas_used > self.num_deltas: + self.deltas_used = self.num_deltas + + idx = np.arange(max_rewards.size)[max_rewards >= np.percentile( + max_rewards, 100 * (1 - (self.deltas_used / self.num_deltas)))] + deltas_idx = deltas_idx[idx] + rollout_rewards = rollout_rewards[idx, :] + + # normalize rewards by their standard deviation + rollout_rewards /= np.std(rollout_rewards) + + t1 = time.time() + # aggregate rollouts to form g_hat, the gradient used to compute SGD step + g_hat, count = utils.batched_weighted_sum( + rollout_rewards[:, 0] - rollout_rewards[:, 1], + (self.deltas.get(idx, self.w_policy.size) for idx in deltas_idx), + batch_size=500) + g_hat /= deltas_idx.size + t2 = time.time() + print('time to aggregate rollouts', t2 - t1) + return g_hat, info_dict + + def train_step(self): + """ + Perform one update step of the policy weights. 
+    """
+
+    g_hat, info_dict = self.aggregate_rollouts()
+    print('Euclidean norm of update step:', np.linalg.norm(g_hat))
+    self.w_policy -= self.optimizer._compute_step(g_hat).reshape(
+        self.w_policy.shape)
+    return info_dict
+
+  def train(self, num_iter):
+
+    start = time.time()
+    for i in range(num_iter):
+
+      t1 = time.time()
+      info_dict = self.train_step()
+      t2 = time.time()
+      print('total time of one step', t2 - t1)
+      print('iter ', i, ' done')
+
+      # record statistics every 10 iterations
+      if ((i) % 10 == 0):
+
+        rewards = self.aggregate_rollouts(num_rollouts=8, evaluate=True)
+        w = self.workers[0].get_weights_plus_stats()
+
+        checkpoint_filename = os.path.join(
+            self.logdir, 'lin_policy_plus_{:03d}.npz'.format(i))
+        print('Save checkpoints to {}...'.format(checkpoint_filename))
+        # np.savez writes a zip archive, so the file must be opened in binary mode.
+        checkpoint_file = open(checkpoint_filename, 'wb')
+        np.savez(checkpoint_file, w)
+        print('End save checkpoints.')
+        print(sorted(self.params.items()))
+        logz.log_tabular('Time', time.time() - start)
+        logz.log_tabular('Iteration', i + 1)
+        logz.log_tabular('AverageReward', np.mean(rewards))
+        logz.log_tabular('StdRewards', np.std(rewards))
+        logz.log_tabular('MaxRewardRollout', np.max(rewards))
+        logz.log_tabular('MinRewardRollout', np.min(rewards))
+        logz.log_tabular('timesteps', self.timesteps)
+        logz.dump_tabular()
+
+      t1 = time.time()
+      # get statistics from all workers
+      for j in range(self.num_workers):
+        self.policy.observation_filter.update(self.workers[j].get_filter())
+      self.policy.observation_filter.stats_increment()
+
+      # make sure master filter buffer is clear
+      self.policy.observation_filter.clear_buffer()
+      # sync all workers
+      #filter_id = ray.put(self.policy.observation_filter)
+      setting_filters_ids = [
+          worker.sync_filter(self.policy.observation_filter)
+          for worker in self.workers
+      ]
+      # waiting for sync of all workers
+      #ray.get(setting_filters_ids)
+
+      increment_filters_ids = [
+          worker.stats_increment() for worker in self.workers
+      ]
+      # waiting for increment of all workers
+      #ray.get(increment_filters_ids)
+      t2 = time.time()
+      print('Time to sync statistics:', t2 - t1)
+
+    return info_dict
diff --git a/examples/pybullet/gym/pybullet_envs/ARS/ars_server.py b/examples/pybullet/gym/pybullet_envs/ARS/ars_server.py
new file mode 100644
index 000000000..f680dd632
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/ARS/ars_server.py
@@ -0,0 +1,62 @@
+"""
+blaze build -c opt //experimental/users/jietan/ARS:ars_server
+
+blaze-bin/experimental/users/jietan/ARS/ars_server \
+--config_name=MINITAUR_GYM_CONFIG
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+from absl import app
+from absl import flags
+from concurrent import futures
+import grpc
+from grpc import loas2
+from google3.robotics.reinforcement_learning.minitaur.envs import minitaur_gym_env
+from google3.robotics.reinforcement_learning.minitaur.envs import minitaur_reactive_env
+from google3.robotics.reinforcement_learning.minitaur.envs.env_randomizers import minitaur_env_randomizer
+from google3.robotics.reinforcement_learning.minitaur.envs.env_randomizers import minitaur_env_randomizer_from_config as randomizer_config_lib
+from google3.experimental.users.jietan.ARS import ars_evaluation_service_pb2_grpc
+from google3.experimental.users.jietan.ARS import ars_evaluation_service
+
+FLAGS = flags.FLAGS
+flags.DEFINE_integer("server_id", 0, "Unique id of this evaluation server.")
+flags.DEFINE_integer("port", 20000, "port number.")
+flags.DEFINE_string("config_name", None,
"The name of the config dictionary.") +flags.DEFINE_bool('run_on_borg', False, + 'Whether the servers are running on borg.') + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + + +def main(unused_argv): + servers = [] + server_creds = loas2.loas2_server_credentials() + port = FLAGS.port + if not FLAGS.run_on_borg: + port = 20000 + FLAGS.server_id + server = grpc.server( + futures.ThreadPoolExecutor(max_workers=10), ports=(port,)) + servicer = ars_evaluation_service.ParameterEvaluationServicer( + FLAGS.config_name, worker_id=FLAGS.server_id) + ars_evaluation_service_pb2_grpc.add_EvaluationServicer_to_server( + servicer, server) + server.add_secure_port("[::]:{}".format(port), server_creds) + servers.append(server) + server.start() + print("Start server {}".format(FLAGS.server_id)) + + # prevent the main thread from exiting + try: + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + for server in servers: + server.stop(0) + + +if __name__ == "__main__": + app.run(main) diff --git a/examples/pybullet/gym/pybullet_envs/ARS/config_ars.py b/examples/pybullet/gym/pybullet_envs/ARS/config_ars.py new file mode 100644 index 000000000..e333f88ab --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/config_ars.py @@ -0,0 +1,83 @@ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +from pybullet_envs.minitaur.envs import minitaur_gym_env +from pybullet_envs.minitaur.envs import minitaur_reactive_env +from pybullet_envs.minitaur.envs.env_randomizers import minitaur_env_randomizer +from pybullet_envs.minitaur.envs.env_randomizers import minitaur_env_randomizer_from_config as randomizer_config_lib + +MAX_LENGTH = 1000 + + +def merge_two_dicts(x, y): + """Given two dicts, merge them into a new dict as a shallow copy.""" + z = dict(x) + z.update(y) + return z + + +# The default configurations. +DEFAULT_CONFIG = dict( + num_workers=8, + num_directions=8, + num_iterations=1000, + deltas_used=8, + step_size=0.02, + delta_std=0.03, + rollout_length=MAX_LENGTH, + shift=0, + seed=237, + policy_type="linear", + filter="MeanStdFilter", +) + +# Configuration specific to minitaur_gym_env.MinitaurGymEnv class. +MINITAUR_GYM_CONFIG_ADDITIONS = dict( + env=functools.partial( + minitaur_gym_env.MinitaurGymEnv, + urdf_version=minitaur_gym_env.DERPY_V0_URDF_VERSION, + accurate_motor_model_enabled=True, + motor_overheat_protection=True, + pd_control_enabled=True, + env_randomizer=None,#minitaur_env_randomizer.MinitaurEnvRandomizer(), + render=False, + num_steps_to_log=MAX_LENGTH)) +MINITAUR_GYM_CONFIG = merge_two_dicts(DEFAULT_CONFIG, + MINITAUR_GYM_CONFIG_ADDITIONS) + +# Configuration specific to MinitaurReactiveEnv class. +MINITAUR_REACTIVE_CONFIG_ADDITIONS = dict( + env=functools.partial( + minitaur_reactive_env.MinitaurReactiveEnv, + urdf_version=minitaur_gym_env.RAINBOW_DASH_V0_URDF_VERSION, + energy_weight=0.005, + accurate_motor_model_enabled=True, + pd_latency=0.003, + control_latency=0.02, + motor_kd=0.015, + remove_default_joint_damping=True, + env_randomizer=None, + render=False, + num_steps_to_log=MAX_LENGTH)) +MINITAUR_REACTIVE_CONFIG = merge_two_dicts(DEFAULT_CONFIG, + MINITAUR_REACTIVE_CONFIG_ADDITIONS) + +# Configuration specific to MinitaurReactiveEnv class with randomizer. 
+MINITAUR_REACTIVE_RANDOMIZER_CONFIG_ADDITIONS = dict( + env=functools.partial( + minitaur_reactive_env.MinitaurReactiveEnv, + urdf_version=minitaur_gym_env.RAINBOW_DASH_V0_URDF_VERSION, + energy_weight=0.005, + accurate_motor_model_enabled=True, + pd_latency=0.003, + control_latency=0.02, + motor_kd=0.015, + remove_default_joint_damping=True, + env_randomizer=randomizer_config_lib.MinitaurEnvRandomizerFromConfig(), + render=False, + num_steps_to_log=MAX_LENGTH)) +MINITAUR_REACTIVE_RANDOMIZER_CONFIG = merge_two_dicts( + DEFAULT_CONFIG, MINITAUR_REACTIVE_RANDOMIZER_CONFIG_ADDITIONS) diff --git a/examples/pybullet/gym/pybullet_envs/ARS/eval_ars.py b/examples/pybullet/gym/pybullet_envs/ARS/eval_ars.py new file mode 100644 index 000000000..d0c144bfa --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/eval_ars.py @@ -0,0 +1,99 @@ +""" +blaze run -c opt //experimental/users/jietan/ARS:eval_ars -- \ +--logdir=/cns/ij-d/home/jietan/experiment/ARS/ars_react_nr01.191950338.191950550/ \ +--checkpoint=lin_policy_plus_990.npz \ +--num_rollouts=10 +""" + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os, inspect +import time + +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +os.sys.path.insert(0,currentdir) + +from absl import app +from absl import flags + +import pdb +import os +import numpy as np +import gym +import config_ars +import utility +import policies + +FLAGS = flags.FLAGS + +flags.DEFINE_string('logdir', None, 'The path of the checkpoint.') +flags.DEFINE_string('checkpoint', None, 'The file name of the checkpoint.') +flags.DEFINE_integer('num_rollouts', 1, 'The number of rollouts.') + + +def main(argv): + del argv # Unused. + + print('loading and building expert policy') + checkpoint_file = os.path.join(FLAGS.logdir, FLAGS.checkpoint) + lin_policy = np.load(checkpoint_file, encoding='bytes') + lin_policy = lin_policy.items()[0][1] + + M = lin_policy[0] + # mean and std of state vectors estimated online by ARS. + mean = lin_policy[1] + std = lin_policy[2] + + config = utility.load_config(FLAGS.logdir) + print("config=",config) + env = config['env'](hard_reset=True, render=True) + ob_dim = env.observation_space.shape[0] + ac_dim = env.action_space.shape[0] + + # set policy parameters. Possible filters: 'MeanStdFilter' for v2, 'NoFilter' for v1. + policy_params = { + 'type': 'linear', + 'ob_filter': config['filter'], + 'ob_dim': ob_dim, + 'ac_dim': ac_dim, + "weights": M, + "mean": mean, + "std": std, + } + policy = policies.LinearPolicy(policy_params, update_filter=False) + returns = [] + observations = [] + actions = [] + for i in range(FLAGS.num_rollouts): + print('iter', i) + obs = env.reset() + done = False + totalr = 0. + steps = 0 + while not done: + action = policy.act(obs) + observations.append(obs) + actions.append(action) + + obs, r, done, _ = env.step(action) + time.sleep(1./100.) 
+ totalr += r + steps += 1 + if steps % 100 == 0: + print('%i/%i' % (steps, config['rollout_length'])) + if steps >= config['rollout_length']: + break + returns.append(totalr) + + print('returns', returns) + print('mean return', np.mean(returns)) + print('std of return', np.std(returns)) + + +if __name__ == '__main__': + flags.mark_flag_as_required('logdir') + flags.mark_flag_as_required('checkpoint') + app.run(main) diff --git a/examples/pybullet/gym/pybullet_envs/ARS/filter.py b/examples/pybullet/gym/pybullet_envs/ARS/filter.py new file mode 100644 index 000000000..ac68a2341 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/filter.py @@ -0,0 +1,280 @@ +# Code in this file is copied and adapted from +# https://github.com/ray-project/ray/blob/master/python/ray/rllib/utils/filter.py + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + + +class Filter(object): + """Processes input, possibly statefully.""" + + def update(self, other, *args, **kwargs): + """Updates self with "new state" from other filter.""" + raise NotImplementedError + + def copy(self): + """Creates a new object with same state as self. + + Returns: + copy (Filter): Copy of self""" + raise NotImplementedError + + def sync(self, other): + """Copies all state from other filter to self.""" + raise NotImplementedError + + +class NoFilter(Filter): + def __init__(self, *args): + pass + + def __call__(self, x, update=True): + return np.asarray(x, dtype = np.float64) + + def update(self, other, *args, **kwargs): + pass + + def copy(self): + return self + + def sync(self, other): + pass + + def stats_increment(self): + pass + + def clear_buffer(self): + pass + + def get_stats(self): + return 0, 1 + + @property + def mean(self): + return 0 + + @property + def var(self): + return 1 + + @property + def std(self): + return 1 + + + +# http://www.johndcook.com/blog/standard_deviation/ +class RunningStat(object): + + def __init__(self, shape=None): + self._n = 0 + self._M = np.zeros(shape, dtype = np.float64) + self._S = np.zeros(shape, dtype = np.float64) + self._M2 = np.zeros(shape, dtype = np.float64) + + def copy(self): + other = RunningStat() + other._n = self._n + other._M = np.copy(self._M) + other._S = np.copy(self._S) + return other + + def push(self, x): + x = np.asarray(x) + # Unvectorized update of the running statistics. + assert x.shape == self._M.shape, ("x.shape = {}, self.shape = {}" + .format(x.shape, self._M.shape)) + n1 = self._n + self._n += 1 + if self._n == 1: + self._M[...] = x + else: + delta = x - self._M + deltaM2 = np.square(x) - self._M2 + self._M[...] += delta / self._n + self._S[...] 
+= delta * delta * n1 / self._n + + + def update(self, other): + n1 = self._n + n2 = other._n + n = n1 + n2 + delta = self._M - other._M + delta2 = delta * delta + M = (n1 * self._M + n2 * other._M) / n + S = self._S + other._S + delta2 * n1 * n2 / n + self._n = n + self._M = M + self._S = S + + def __repr__(self): + return '(n={}, mean_mean={}, mean_std={})'.format( + self.n, np.mean(self.mean), np.mean(self.std)) + + @property + def n(self): + return self._n + + @property + def mean(self): + return self._M + + @property + def var(self): + return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) + + @property + def std(self): + return np.sqrt(self.var) + + @property + def shape(self): + return self._M.shape + + +class MeanStdFilter(Filter): + """Keeps track of a running mean for seen states""" + + def __init__(self, shape, demean=True, destd=True): + self.shape = shape + self.demean = demean + self.destd = destd + self.rs = RunningStat(shape) + # In distributed rollouts, each worker sees different states. + # The buffer is used to keep track of deltas amongst all the + # observation filters. + + self.buffer = RunningStat(shape) + + self.mean = np.zeros(shape, dtype = np.float64) + self.std = np.ones(shape, dtype = np.float64) + + def clear_buffer(self): + self.buffer = RunningStat(self.shape) + return + + def update(self, other, copy_buffer=False): + """Takes another filter and only applies the information from the + buffer. + + Using notation `F(state, buffer)` + Given `Filter1(x1, y1)` and `Filter2(x2, yt)`, + `update` modifies `Filter1` to `Filter1(x1 + yt, y1)` + If `copy_buffer`, then `Filter1` is modified to + `Filter1(x1 + yt, yt)`. + """ + self.rs.update(other.buffer) + if copy_buffer: + self.buffer = other.buffer.copy() + return + + def copy(self): + """Returns a copy of Filter.""" + other = MeanStdFilter(self.shape) + other.demean = self.demean + other.destd = self.destd + other.rs = self.rs.copy() + other.buffer = self.buffer.copy() + return other + + def sync(self, other): + """Syncs all fields together from other filter. + + Using notation `F(state, buffer)` + Given `Filter1(x1, y1)` and `Filter2(x2, yt)`, + `sync` modifies `Filter1` to `Filter1(x2, yt)` + """ + assert other.shape == self.shape, "Shapes don't match!" + self.demean = other.demean + self.destd = other.destd + self.rs = other.rs.copy() + self.buffer = other.buffer.copy() + return + + def __call__(self, x, update=True): + x = np.asarray(x, dtype = np.float64) + if update: + if len(x.shape) == len(self.rs.shape) + 1: + # The vectorized case. + for i in range(x.shape[0]): + self.rs.push(x[i]) + self.buffer.push(x[i]) + else: + # The unvectorized case. + self.rs.push(x) + self.buffer.push(x) + if self.demean: + x = x - self.mean + if self.destd: + x = x / (self.std + 1e-8) + return x + + def stats_increment(self): + self.mean = self.rs.mean + self.std = self.rs.std + + # Set values for std less than 1e-7 to +inf to avoid + # dividing by zero. State elements with zero variance + # are set to zero as a result. 
+ self.std[self.std < 1e-7] = float("inf") + return + + def get_stats(self): + return self.rs.mean, (self.rs.std + 1e-8) + + def __repr__(self): + return 'MeanStdFilter({}, {}, {}, {}, {}, {})'.format( + self.shape, self.demean, + self.rs, self.buffer) + + +def get_filter(filter_config, shape = None): + if filter_config == "MeanStdFilter": + return MeanStdFilter(shape) + elif filter_config == "NoFilter": + return NoFilter() + else: + raise Exception("Unknown observation_filter: " + + str(filter_config)) + + +def test_running_stat(): + for shp in ((), (3,), (3, 4)): + li = [] + rs = RunningStat(shp) + for _ in range(5): + val = np.random.randn(*shp) + rs.push(val) + li.append(val) + m = np.mean(li, axis=0) + assert np.allclose(rs.mean, m) + v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) + assert np.allclose(rs.var, v) + + +def test_combining_stat(): + for shape in [(), (3,), (3, 4)]: + li = [] + rs1 = RunningStat(shape) + rs2 = RunningStat(shape) + rs = RunningStat(shape) + for _ in range(5): + val = np.random.randn(*shape) + rs1.push(val) + rs.push(val) + li.append(val) + for _ in range(9): + rs2.push(val) + rs.push(val) + li.append(val) + rs1.update(rs2) + assert np.allclose(rs.mean, rs1.mean) + assert np.allclose(rs.std, rs1.std) + + +test_running_stat() +test_combining_stat() diff --git a/examples/pybullet/gym/pybullet_envs/ARS/log/config.yaml b/examples/pybullet/gym/pybullet_envs/ARS/log/config.yaml new file mode 100644 index 000000000..36690d51d --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/log/config.yaml @@ -0,0 +1,29 @@ +delta_std: 0.03 +deltas_used: 8 +env: !!python/object/apply:functools.partial + args: + - &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_reactive_env.MinitaurReactiveEnv '' + state: !!python/tuple + - *id001 + - !!python/tuple [] + - accurate_motor_model_enabled: true + control_latency: 0.02 + energy_weight: 0.005 + env_randomizer: null + motor_kd: 0.015 + num_steps_to_log: 1000 + pd_latency: 0.003 + remove_default_joint_damping: true + render: false + urdf_version: rainbow_dash_v0 + - null +filter: MeanStdFilter +num_directions: 8 +num_iterations: 1000 +num_workers: 8 +policy_type: linear +rollout_length: 1000 +seed: 237 +shift: 0 +step_size: 0.02 + diff --git a/examples/pybullet/gym/pybullet_envs/ARS/log/lin_policy_plus_990_bla.npz b/examples/pybullet/gym/pybullet_envs/ARS/log/lin_policy_plus_990_bla.npz new file mode 100644 index 000000000..54fd39ba7 Binary files /dev/null and b/examples/pybullet/gym/pybullet_envs/ARS/log/lin_policy_plus_990_bla.npz differ diff --git a/examples/pybullet/gym/pybullet_envs/ARS/logz.py b/examples/pybullet/gym/pybullet_envs/ARS/logz.py new file mode 100644 index 000000000..790d83c21 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/logz.py @@ -0,0 +1,104 @@ +# Code in this file is copied and adapted from +# https://github.com/berkeleydeeprlcourse + +import json + +""" + +Some simple logging functionality, inspired by rllab's logging. 
+Assumes that each diagnostic gets logged each iteration + +Call logz.configure_output_dir() to start logging to a +tab-separated-values file (some_folder_name/log.txt) + +""" + +import os.path as osp, shutil, time, atexit, os, subprocess + +color2num = dict( + gray=30, + red=31, + green=32, + yellow=33, + blue=34, + magenta=35, + cyan=36, + white=37, + crimson=38 +) + +def colorize(string, color, bold=False, highlight=False): + attr = [] + num = color2num[color] + if highlight: num += 10 + attr.append(str(num)) + if bold: attr.append('1') + return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) + +class G(object): + output_dir = None + output_file = None + first_row = True + log_headers = [] + log_current_row = {} + +def configure_output_dir(d=None): + """ + Set output directory to d, or to /tmp/somerandomnumber if d is None + """ + G.first_row = True + G.log_headers = [] + G.log_current_row = {} + + G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) + if not osp.exists(G.output_dir): + os.makedirs(G.output_dir) + G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') + atexit.register(G.output_file.close) + print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) + +def log_tabular(key, val): + """ + Log a value of some diagnostic + Call this once for each diagnostic quantity, each iteration + """ + if G.first_row: + G.log_headers.append(key) + else: + assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key + assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key + G.log_current_row[key] = val + + +def save_params(params): + with open(osp.join(G.output_dir, "params.json"), 'w') as out: + out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) + + +def dump_tabular(): + """ + Write all of the diagnostics from the current iteration + """ + vals = [] + key_lens = [len(key) for key in G.log_headers] + max_key_len = max(15,max(key_lens)) + keystr = '%'+'%d'%max_key_len + fmt = "| " + keystr + "s | %15s |" + n_slashes = 22 + max_key_len + print("-"*n_slashes) + for key in G.log_headers: + val = G.log_current_row.get(key, "") + if hasattr(val, "__float__"): valstr = "%8.3g"%val + else: valstr = val + print(fmt%(key, valstr)) + vals.append(val) + print("-"*n_slashes) + if G.output_file is not None: + if G.first_row: + G.output_file.write("\t".join(G.log_headers)) + G.output_file.write("\n") + G.output_file.write("\t".join(map(str,vals))) + G.output_file.write("\n") + G.output_file.flush() + G.log_current_row.clear() + G.first_row=False diff --git a/examples/pybullet/gym/pybullet_envs/ARS/optimizers.py b/examples/pybullet/gym/pybullet_envs/ARS/optimizers.py new file mode 100644 index 000000000..063be9b87 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/optimizers.py @@ -0,0 +1,35 @@ +# Code in this file is copied and adapted from +# https://github.com/openai/evolution-strategies-starter. 
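+#
+# Note on sign conventions: these optimizers compute steps for *minimizing* an
+# objective, so SGD._compute_step() returns -stepsize * gradient. In ars.py,
+# ARSLearner.train_step() applies `w_policy -= _compute_step(g_hat)`, so the net
+# update is +stepsize * g_hat, i.e. gradient ascent on the estimated reward.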
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +# OPTIMIZERS FOR MINIMIZING OBJECTIVES +class Optimizer(object): + def __init__(self, w_policy): + self.w_policy = w_policy.flatten() + self.dim = w_policy.size + self.t = 0 + + def update(self, globalg): + self.t += 1 + step = self._compute_step(globalg) + ratio = np.linalg.norm(step) / (np.linalg.norm(self.w_policy) + 1e-5) + return self.w_policy + step, ratio + + def _compute_step(self, globalg): + raise NotImplementedError + + +class SGD(Optimizer): + def __init__(self, pi, stepsize): + Optimizer.__init__(self, pi) + self.stepsize = stepsize + + def _compute_step(self, globalg): + step = -self.stepsize * globalg + return step + diff --git a/examples/pybullet/gym/pybullet_envs/ARS/policies.py b/examples/pybullet/gym/pybullet_envs/ARS/policies.py new file mode 100644 index 000000000..0adc27451 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/policies.py @@ -0,0 +1,72 @@ +""" +Policy class for computing action from weights and observation vector. +Horia Mania --- hmania@berkeley.edu +Aurelia Guy +Benjamin Recht +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import filter + + +class Policy(object): + + def __init__(self, policy_params): + + self.ob_dim = policy_params['ob_dim'] + self.ac_dim = policy_params['ac_dim'] + self.weights = np.empty(0) + + # a filter for updating statistics of the observations and normalizing + # inputs to the policies + self.observation_filter = filter.get_filter( + policy_params['ob_filter'], shape=(self.ob_dim,)) + self.update_filter = True + + def update_weights(self, new_weights): + self.weights[:] = new_weights[:] + return + + def get_weights(self): + return self.weights + + def get_observation_filter(self): + return self.observation_filter + + def act(self, ob): + raise NotImplementedError + + def copy(self): + raise NotImplementedError + + +class LinearPolicy(Policy): + """ + Linear policy class that computes action as . + """ + + def __init__(self, policy_params, update_filter=True): + Policy.__init__(self, policy_params) + self.weights = np.zeros(self.ac_dim * self.ob_dim, dtype=np.float64) + if "weights" in policy_params: + self.weights = policy_params["weights"] + if "mean" in policy_params: + self.observation_filter.mean = policy_params["mean"] + if "std" in policy_params: + self.observation_filter.std = policy_params["std"] + self.update_filter = update_filter + + def act(self, ob): + ob = self.observation_filter(ob, update=self.update_filter) + matrix_weights = np.reshape(self.weights, (self.ac_dim, self.ob_dim)) + return np.clip(np.dot(matrix_weights, ob), -1.0, 1.0) + + def get_weights_plus_stats(self): + + mu, std = self.observation_filter.get_stats() + aux = np.asarray([self.weights, mu, std]) + return aux diff --git a/examples/pybullet/gym/pybullet_envs/ARS/shared_noise.py b/examples/pybullet/gym/pybullet_envs/ARS/shared_noise.py new file mode 100644 index 000000000..1f4074920 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/shared_noise.py @@ -0,0 +1,41 @@ +"""TODO(jietan): DO NOT SUBMIT without one-line documentation for shared_noise. +Code in this file is copied and adapted from +https://github.com/ray-project/ray/tree/master/python/ray/rllib/es +TODO(jietan): DO NOT SUBMIT without a detailed description of shared_noise. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np + + +def create_shared_noise(): + """ + Create a large array of noise to be shared by all workers. Used + for avoiding the communication of the random perturbations delta. + """ + + seed = 12345 + count = 250000000 + noise = np.random.RandomState(seed).randn(count).astype(np.float64) + return noise + + +class SharedNoiseTable(object): + def __init__(self, noise, seed = 11): + + self.rg = np.random.RandomState(seed) + self.noise = noise + assert self.noise.dtype == np.float64 + + def get(self, i, dim): + return self.noise[i:i + dim] + + def sample_index(self, dim): + return self.rg.randint(0, len(self.noise) - dim + 1) + + def get_delta(self, dim): + idx = self.sample_index(dim) + return idx, self.get(idx, dim) + diff --git a/examples/pybullet/gym/pybullet_envs/ARS/start_ars_servers.py b/examples/pybullet/gym/pybullet_envs/ARS/start_ars_servers.py new file mode 100644 index 000000000..94aa6835f --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/start_ars_servers.py @@ -0,0 +1,28 @@ +"""TODO(jietan): DO NOT SUBMIT without one-line documentation for start_ars_servers. + +blaze build -c opt //experimental/users/jietan/ARS:start_ars_servers +blaze-bin/experimental/users/jietan/ARS/start_ars_servers + +TODO(jietan): DO NOT SUBMIT without a detailed description of start_ars_servers. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import time +import subprocess +from absl import app +from absl import flags + +FLAGS = flags.FLAGS +flags.DEFINE_integer("num_servers", 8, "number of servers") + +def main(argv): + del argv # Unused. + for server_id in xrange(FLAGS.num_servers): + args = ["blaze-bin/experimental/users/jietan/ARS/ars_server", "--config_name=MINITAUR_GYM_CONFIG", "--server_id={}".format(server_id), "--run_on_borg=False"] + subprocess.Popen(args) + + +if __name__ == '__main__': + app.run(main) diff --git a/examples/pybullet/gym/pybullet_envs/ARS/train_ars.borg b/examples/pybullet/gym/pybullet_envs/ARS/train_ars.borg new file mode 100644 index 000000000..c09775b50 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/train_ars.borg @@ -0,0 +1,93 @@ +// Example borg file to do a parameter sweep. 
+// +// To run: +// echo `srcfs get_readonly`-`g4 p | head -1 | awk '{print $2}'` +// blaze build -c opt experimental/users/jietan/ARS:ars_server.par +// blaze build -c opt experimental/users/jietan/ARS:ars_client.par +// borgcfg --skip_confirmation --vars 'base_cl=191950338,my_cl=191950550,label=ars_react_nr01,config=MINITAUR_REACTIVE_CONFIG' experimental/users/jietan/ARS/train_ars.borg reload +// borgcfg --skip_confirmation --vars 'base_cl=191950338,my_cl=191950550,label=ars_react_rd01,config=MINITAUR_REACTIVE_RANDOMIZER_CONFIG' experimental/users/jietan/ARS/train_ars.borg reload + + +import '//production/borg/templates/lambda/buildtool_support.borg' as build +import '//production/borg/templates/lambda/dnsname.borg' as dns + +vars = { + cell = 'atlanta' + charged_user = 'robotics' + base_cl = 0 + my_cl = 0 + label = external + user = real_username() + workers = 8 + config = external + cns_home = "/cns/ij-d/home/%user%" + logdir = "%cns_home%/experiment/ARS/%label%.%base_cl%.%my_cl%/" +} + +service augmented_random_search { + runtime { + cell = vars.cell + } + + scheduling = { + priority = 100 + batch_quota = { + strategy = 'RUN_SOON' + } + deadline = 3600 * 24 + } + accounting = { + charged_user = vars.charged_user + } + requirements { + autopilot = true + } + params = { + mygoogle3 = build.google3dir(myfilename()) + experiment_dir = 'experimental/users/jietan/ARS/' + } + + job ars_server = { + runtime { + cell = vars.cell + } + name = real_username() + '_server_' + vars.label + replicas = vars.workers + binary_path = build.binfile_v2(params.mygoogle3, + params.experiment_dir + 'ars_server') + runfiles = binary_path + '.runfiles/google3/' + packages = { + package third_party = { + directory = runfiles + 'third_party/' + } + } + binary = build.binfile(params.mygoogle3, + params.experiment_dir + 'ars_server.par') + args = { + server_id = '%task%' + config_name = vars.config + port = '%port%' + run_on_borg = true + } + } + job ars_client = { + name = real_username() + '_client_' + vars.label + binary_path = build.binfile_v2(params.mygoogle3, + params.experiment_dir + 'ars_client') + runfiles = binary_path + '.runfiles/google3/' + packages = { + package third_party = { + directory = runfiles + 'third_party/' + } + } + binary = build.binfile(params.mygoogle3, + params.experiment_dir + 'ars_client.par') + args = { + server_address = dns.borg_dns_name(ars_server) + num_servers = vars.workers + config_name = vars.config + logdir = vars.logdir + run_on_borg = true + } + } +} diff --git a/examples/pybullet/gym/pybullet_envs/ARS/train_ars.py b/examples/pybullet/gym/pybullet_envs/ARS/train_ars.py new file mode 100644 index 000000000..147dbc6eb --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/train_ars.py @@ -0,0 +1,64 @@ +"""TODO(jietan): DO NOT SUBMIT without one-line documentation for train_ars. 
+ +blaze build -c opt //experimental/users/jietan/ARS:train_ars +blaze-bin/experimental/users/jietan/ARS/train_ars \ +--logdir=/cns/ij-d/home/jietan/experiment/ARS/test1 \ +--config_name=MINITAUR_GYM_CONFIG +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import os +from absl import app +from absl import flags +import ars +import config_ars + +FLAGS = flags.FLAGS +flags.DEFINE_string('logdir', None, 'The directory to write the log file.') +flags.DEFINE_string('config_name', None, 'The name of the config dictionary') + + +def run_ars(config, logdir): + + env = config["env"]() + ob_dim = env.observation_space.shape[0] + ac_dim = env.action_space.shape[0] + + # set policy parameters. Possible filters: 'MeanStdFilter' for v2, 'NoFilter' for v1. + policy_params = { + 'type': 'linear', + 'ob_filter': config['filter'], + 'ob_dim': ob_dim, + 'ac_dim': ac_dim + } + + ARS = ars.ARSLearner( + env_callback=config['env'], + policy_params=policy_params, + num_deltas=config['num_directions'], + deltas_used=config['deltas_used'], + step_size=config['step_size'], + delta_std=config['delta_std'], + logdir=logdir, + rollout_length=config['rollout_length'], + shift=config['shift'], + params=config, + seed=config['seed']) + + return ARS.train(config['num_iterations']) + + +def main(argv): + del argv # Unused. + config = getattr(config_ars, FLAGS.config_name) + run_ars(config=config, logdir=FLAGS.logdir) + + +if __name__ == '__main__': + flags.mark_flag_as_required('logdir') + flags.mark_flag_as_required('config_name') + app.run(main) diff --git a/examples/pybullet/gym/pybullet_envs/ARS/train_ars_test.py b/examples/pybullet/gym/pybullet_envs/ARS/train_ars_test.py new file mode 100644 index 000000000..13f149fc5 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/train_ars_test.py @@ -0,0 +1,29 @@ +"""Tests for google3.experimental.users.jietan.ARS.train_ars. +blaze build -c opt //experimental/users/jietan/ARS:train_ars_test +blaze-bin/experimental/users/jietan/ARS/train_ars_test +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags +from google3.testing.pybase import googletest +from google3.experimental.users.jietan.ARS import train_ars +from google3.experimental.users.jietan.ARS import config_ars + +FLAGS = flags.FLAGS +MAX_RETURN_AFTER_TWO_ITEATIONS = 0.0890905394617 + +class TrainArsTest(googletest.TestCase): + + def testArsTwoStepResult(self): + config = getattr(config_ars, "MINITAUR_REACTIVE_CONFIG") + config['num_iterations'] = 2 + info = train_ars.run_ars(config=config, logdir=FLAGS.test_tmpdir) + print (info) + self.assertAlmostEqual(info["max_reward"], MAX_RETURN_AFTER_TWO_ITEATIONS) + + +if __name__ == '__main__': + googletest.main() diff --git a/examples/pybullet/gym/pybullet_envs/ARS/utility.py b/examples/pybullet/gym/pybullet_envs/ARS/utility.py new file mode 100644 index 000000000..591418be5 --- /dev/null +++ b/examples/pybullet/gym/pybullet_envs/ARS/utility.py @@ -0,0 +1,52 @@ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import ruamel.yaml as yaml + +def save_config(config, logdir): + """Save a new configuration by name. + + If a logging directory is specified, is will be created and the configuration + will be stored there. Otherwise, a log message will be printed. + + Args: + config: Configuration object. 
+    logdir: Location for writing summaries and checkpoints if specified.
+
+  Returns:
+    Configuration object.
+  """
+  message = 'Start a new run and write summaries and checkpoints to {}.'
+  print(message.format(logdir))
+  config_path = os.path.join(logdir, 'config.yaml')
+  # yaml.dump expects a writable stream, not a path, so open the file first.
+  with open(config_path, 'w') as config_file:
+    yaml.dump(config, config_file, default_flow_style=False)
+  return config
+
+
+def load_config(logdir):
+  """Load a configuration from the log directory.
+
+  Args:
+    logdir: The logging directory containing the configuration file.
+
+  Raises:
+    IOError: The logging directory does not contain a configuration file.
+
+  Returns:
+    Configuration object.
+  """
+  config_path = logdir and os.path.join(logdir, 'config.yaml')
+  if not config_path:
+    message = (
+        'Cannot resume an existing run since the logging directory does not '
+        'contain a configuration file.')
+    raise IOError(message)
+  print("config_path=", config_path)
+
+  stream = open(config_path, 'r')
+  config = yaml.load(stream)
+  message = 'Resume run and write summaries and checkpoints to {}.'
+  print(message.format(logdir))
+  return config
diff --git a/examples/pybullet/gym/pybullet_envs/ARS/utils.py b/examples/pybullet/gym/pybullet_envs/ARS/utils.py
new file mode 100644
index 000000000..3b7e03afa
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/ARS/utils.py
@@ -0,0 +1,28 @@
+# Code in this file is copied and adapted from
+# https://github.com/openai/evolution-strategies-starter.
+
+import numpy as np
+
+def itergroups(items, group_size):
+  assert group_size >= 1
+  group = []
+  for x in items:
+    group.append(x)
+    if len(group) == group_size:
+      yield tuple(group)
+      del group[:]
+  if group:
+    yield tuple(group)
+
+
+
+def batched_weighted_sum(weights, vecs, batch_size):
+  total = 0
+  num_items_summed = 0
+  for batch_weights, batch_vecs in zip(itergroups(weights, batch_size),
+                                       itergroups(vecs, batch_size)):
+    assert len(batch_weights) == len(batch_vecs) <= batch_size
+    total += np.dot(np.asarray(batch_weights, dtype=np.float64),
+                    np.asarray(batch_vecs, dtype=np.float64))
+    num_items_summed += len(batch_weights)
+  return total, num_items_summed
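
Reviewer note, not part of the diff: batched_weighted_sum above is just a memory-friendly form of the weighted sum that ARSLearner.aggregate_rollouts uses to build its gradient estimate, with the reward differences (r+ - r-) as weights and the sampled noise directions as vectors. The standalone sketch below assumes utils.py from this directory is importable and uses made-up dimensions; it only checks that the batched helper matches a plain dot product.

import numpy as np
import utils  # assumes examples/pybullet/gym/pybullet_envs/ARS is on sys.path

num_deltas, dim = 16, 40
rng = np.random.RandomState(0)
pos_rewards = rng.randn(num_deltas)   # rewards from +delta rollouts
neg_rewards = rng.randn(num_deltas)   # rewards from -delta rollouts
deltas = rng.randn(num_deltas, dim)   # sampled noise directions

# ARS-style estimate: sum_k (r+_k - r-_k) * delta_k, averaged over the deltas,
# computed the same way aggregate_rollouts combines rewards with noise vectors.
g_hat, count = utils.batched_weighted_sum(
    pos_rewards - neg_rewards, (d for d in deltas), batch_size=4)
g_hat /= num_deltas

reference = np.dot(pos_rewards - neg_rewards, deltas) / num_deltas
assert count == num_deltas
assert np.allclose(g_hat, reference)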