add yapf style and apply yapf to format all Python files

This recreates pull request #2192
Author: Erwin Coumans
Date: 2019-04-27 07:31:15 -07:00
parent c591735042
commit ef9570c315
347 changed files with 70304 additions and 22752 deletions
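
The change itself is mechanical: every Python file in the tree is run through yapf with the repository's style settings and rewritten in place. Below is a rough sketch of such a pass using yapf's FormatCode API; the inline style options (google base style, 2-space indents, 100-column lines) and the root path are assumptions inferred from the formatted hunks below, not the exact contents of the committed style file.

    # Sketch only: reformat a tree of Python files with yapf.
    # The style string and root path are illustrative assumptions, not the
    # committed .style.yapf.
    import glob

    from yapf.yapflib.yapf_api import FormatCode

    ASSUMED_STYLE = '{based_on_style: google, indent_width: 2, column_limit: 100}'


    def format_tree(root):
      for path in glob.glob(root + '/**/*.py', recursive=True):
        with open(path) as f:
          src = f.read()
        # FormatCode returns the reformatted source and whether it changed.
        formatted, changed = FormatCode(src, style_config=ASSUMED_STYLE)
        if changed:
          with open(path, 'w') as f:
            f.write(formatted)


    if __name__ == '__main__':
      format_tree('.')

The same result can be obtained from the command line with yapf -i -r <dir> once a matching style file is in place.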

View File

@@ -5,17 +5,18 @@ import pybullet_data
AGENT_TYPE_KEY = "AgentType"

def build_agent(world, id, file):
  agent = None
  with open(pybullet_data.getDataPath() + "/" + file) as data_file:
    json_data = json.load(data_file)

    assert AGENT_TYPE_KEY in json_data
    agent_type = json_data[AGENT_TYPE_KEY]

    if (agent_type == PPOAgent.NAME):
      agent = PPOAgent(world, id, json_data)
    else:
      assert False, 'Unsupported agent type: ' + agent_type

  return agent

View File

@@ -2,53 +2,54 @@ import json
import numpy as np
import pybullet_utils.math_util as MathUtil
class ExpParams(object):
  RATE_KEY = 'Rate'
  INIT_ACTION_RATE_KEY = 'InitActionRate'
  NOISE_KEY = 'Noise'
  NOISE_INTERNAL_KEY = 'NoiseInternal'
  TEMP_KEY = 'Temp'

  def __init__(self):
    self.rate = 0.2
    self.init_action_rate = 0
    self.noise = 0.1
    self.noise_internal = 0
    self.temp = 0.1
    return

  def __str__(self):
    str = ''
    str += '{}: {:.2f}\n'.format(self.RATE_KEY, self.rate)
    str += '{}: {:.2f}\n'.format(self.INIT_ACTION_RATE_KEY, self.init_action_rate)
    str += '{}: {:.2f}\n'.format(self.NOISE_KEY, self.noise)
    str += '{}: {:.2f}\n'.format(self.NOISE_INTERNAL_KEY, self.noise_internal)
    str += '{}: {:.2f}\n'.format(self.TEMP_KEY, self.temp)
    return str

  def load(self, json_data):
    if (self.RATE_KEY in json_data):
      self.rate = json_data[self.RATE_KEY]

    if (self.INIT_ACTION_RATE_KEY in json_data):
      self.init_action_rate = json_data[self.INIT_ACTION_RATE_KEY]

    if (self.NOISE_KEY in json_data):
      self.noise = json_data[self.NOISE_KEY]

    if (self.NOISE_INTERNAL_KEY in json_data):
      self.noise_internal = json_data[self.NOISE_INTERNAL_KEY]

    if (self.TEMP_KEY in json_data):
      self.temp = json_data[self.TEMP_KEY]

    return

  def lerp(self, other, t):
    lerp_params = ExpParams()
    lerp_params.rate = MathUtil.lerp(self.rate, other.rate, t)
    lerp_params.init_action_rate = MathUtil.lerp(self.init_action_rate, other.init_action_rate, t)
    lerp_params.noise = MathUtil.lerp(self.noise, other.noise, t)
    lerp_params.noise_internal = MathUtil.lerp(self.noise_internal, other.noise_internal, t)
    lerp_params.temp = MathUtil.log_lerp(self.temp, other.temp, t)
    return lerp_params

View File

@@ -1 +1 @@
from . import *

View File

@@ -3,11 +3,12 @@ import pybullet_envs.deep_mimic.learning.tf_util as TFUtil
NAME = "fc_2layers_1024units"

def build_net(input_tfs, reuse=False):
  layers = [1024, 512]
  activation = tf.nn.relu

  input_tf = tf.concat(axis=-1, values=input_tfs)
  h = TFUtil.fc_net(input_tf, layers, activation=activation, reuse=reuse)
  h = activation(h)
  return h

View File

@@ -1,11 +1,12 @@
import pybullet_envs.deep_mimic.learning.nets.fc_2layers_1024units as fc_2layers_1024units

def build_net(net_name, input_tfs, reuse=False):
  net = None

  if (net_name == fc_2layers_1024units.NAME):
    net = fc_2layers_1024units.build_net(input_tfs, reuse)
  else:
    assert False, 'Unsupported net: ' + net_name

  return net

View File

@@ -3,147 +3,149 @@ import copy
import pybullet_utils.mpi_util as MPIUtil
from pybullet_utils.logger import Logger
class Normalizer(object):
  CHECK_SYNC_COUNT = 50000  # check synchronization after a certain number of entries

  # these group IDs must be the same as those in CharController.h
  NORM_GROUP_SINGLE = 0
  NORM_GROUP_NONE = -1

  class Group(object):

    def __init__(self, id, indices):
      self.id = id
      self.indices = indices
      return

  def __init__(self, size, groups_ids=None, eps=0.02, clip=np.inf):
    self.eps = eps
    self.clip = clip
    self.mean = np.zeros(size)
    self.mean_sq = np.zeros(size)
    self.std = np.ones(size)
    self.count = 0
    self.groups = self._build_groups(groups_ids)

    self.new_count = 0
    self.new_sum = np.zeros_like(self.mean)
    self.new_sum_sq = np.zeros_like(self.mean_sq)
    return

  def record(self, x):
    size = self.get_size()
    is_array = isinstance(x, np.ndarray)
    if not is_array:
      assert (size == 1)
      x = np.array([[x]])

    assert x.shape[-1] == size, \
      Logger.print2('Normalizer shape mismatch, expecting size {:d}, but got {:d}'.format(size, x.shape[-1]))
    x = np.reshape(x, [-1, size])

    self.new_count += x.shape[0]
    self.new_sum += np.sum(x, axis=0)
    self.new_sum_sq += np.sum(np.square(x), axis=0)
    return

  def update(self):
    new_count = MPIUtil.reduce_sum(self.new_count)
    new_sum = MPIUtil.reduce_sum(self.new_sum)
    new_sum_sq = MPIUtil.reduce_sum(self.new_sum_sq)

    new_total = self.count + new_count
    if (self.count // self.CHECK_SYNC_COUNT != new_total // self.CHECK_SYNC_COUNT):
      assert self.check_synced(), Logger.print2('Normalizer parameters desynchronized')

    if new_count > 0:
      new_mean = self._process_group_data(new_sum / new_count, self.mean)
      new_mean_sq = self._process_group_data(new_sum_sq / new_count, self.mean_sq)
      w_old = float(self.count) / new_total
      w_new = float(new_count) / new_total

      self.mean = w_old * self.mean + w_new * new_mean
      self.mean_sq = w_old * self.mean_sq + w_new * new_mean_sq
      self.count = new_total
      self.std = self.calc_std(self.mean, self.mean_sq)

      self.new_count = 0
      self.new_sum.fill(0)
      self.new_sum_sq.fill(0)

    return

  def get_size(self):
    return self.mean.size

  def set_mean_std(self, mean, std):
    size = self.get_size()
    is_array = isinstance(mean, np.ndarray) and isinstance(std, np.ndarray)

    if not is_array:
      assert (size == 1)
      mean = np.array([mean])
      std = np.array([std])

    assert len(mean) == size and len(std) == size, \
      Logger.print2('Normalizer shape mismatch, expecting size {:d}, but got {:d} and {:d}'.format(size, len(mean), len(std)))

    self.mean = mean
    self.std = std
    self.mean_sq = self.calc_mean_sq(self.mean, self.std)
    return

  def normalize(self, x):
    norm_x = (x - self.mean) / self.std
    norm_x = np.clip(norm_x, -self.clip, self.clip)
    return norm_x

  def unnormalize(self, norm_x):
    x = norm_x * self.std + self.mean
    return x

  def calc_std(self, mean, mean_sq):
    var = mean_sq - np.square(mean)
    # some time floating point errors can lead to small negative numbers
    var = np.maximum(var, 0)
    std = np.sqrt(var)
    std = np.maximum(std, self.eps)
    return std

  def calc_mean_sq(self, mean, std):
    return np.square(std) + np.square(self.mean)

  def check_synced(self):
    synced = True
    if MPIUtil.is_root_proc():
      vars = np.concatenate([self.mean, self.mean_sq])
      MPIUtil.bcast(vars)
    else:
      vars_local = np.concatenate([self.mean, self.mean_sq])
      vars_root = np.empty_like(vars_local)
      MPIUtil.bcast(vars_root)
      synced = (vars_local == vars_root).all()
    return synced

  def _build_groups(self, groups_ids):
    groups = []
    if groups_ids is None:
      curr_id = self.NORM_GROUP_SINGLE
      curr_list = np.arange(self.get_size()).astype(np.int32)
      groups.append(self.Group(curr_id, curr_list))
    else:
      ids = np.unique(groups_ids)
      for id in ids:
        curr_list = np.nonzero(groups_ids == id)[0].astype(np.int32)
        groups.append(self.Group(id, curr_list))

    return groups

  def _process_group_data(self, new_data, old_data):
    proc_data = new_data.copy()
    for group in self.groups:
      if group.id == self.NORM_GROUP_NONE:
        proc_data[group.indices] = old_data[group.indices]
      elif group.id != self.NORM_GROUP_SINGLE:
        avg = np.mean(new_data[group.indices])
        proc_data[group.indices] = avg
    return proc_data

View File

@@ -1,46 +1,47 @@
import numpy as np
from pybullet_envs.deep_mimic.env.env import Env
class Path(object):

  def __init__(self):
    self.clear()
    return

  def pathlength(self):
    return len(self.actions)

  def is_valid(self):
    valid = True
    l = self.pathlength()
    valid &= len(self.states) == l + 1
    valid &= len(self.goals) == l + 1
    valid &= len(self.actions) == l
    valid &= len(self.logps) == l
    valid &= len(self.rewards) == l
    valid &= len(self.flags) == l

    return valid

  def check_vals(self):
    for vals in [self.states, self.goals, self.actions, self.logps, self.rewards]:
      for v in vals:
        if not np.isfinite(v).all():
          return False
    return True

  def clear(self):
    self.states = []
    self.goals = []
    self.actions = []
    self.logps = []
    self.rewards = []
    self.flags = []
    self.terminate = Env.Terminate.Null
    return

  def get_pathlen(self):
    return len(self.rewards)

  def calc_return(self):
    return sum(self.rewards)

View File

@@ -13,341 +13,343 @@ import pybullet_utils.mpi_util as MPIUtil
import pybullet_utils.math_util as MathUtil
from pybullet_envs.deep_mimic.env.action_space import ActionSpace
from pybullet_envs.deep_mimic.env.env import Env
'''
Policy Gradient Agent
'''
class PGAgent(TFAgent):
  NAME = 'PG'

  ACTOR_NET_KEY = 'ActorNet'
  ACTOR_STEPSIZE_KEY = 'ActorStepsize'
  ACTOR_MOMENTUM_KEY = 'ActorMomentum'
  ACTOR_WEIGHT_DECAY_KEY = 'ActorWeightDecay'
  ACTOR_INIT_OUTPUT_SCALE_KEY = 'ActorInitOutputScale'

  CRITIC_NET_KEY = 'CriticNet'
  CRITIC_STEPSIZE_KEY = 'CriticStepsize'
  CRITIC_MOMENTUM_KEY = 'CriticMomentum'
  CRITIC_WEIGHT_DECAY_KEY = 'CriticWeightDecay'

  EXP_ACTION_FLAG = 1 << 0

  def __init__(self, world, id, json_data):
    self._exp_action = False
    super().__init__(world, id, json_data)
    return

  def reset(self):
    super().reset()
    self._exp_action = False
    return

  def _check_action_space(self):
    action_space = self.get_action_space()
    return action_space == ActionSpace.Continuous

  def _load_params(self, json_data):
    super()._load_params(json_data)
    self.val_min, self.val_max = self._calc_val_bounds(self.discount)
    self.val_fail, self.val_succ = self._calc_term_vals(self.discount)
    return

  def _build_nets(self, json_data):
    assert self.ACTOR_NET_KEY in json_data
    assert self.CRITIC_NET_KEY in json_data

    actor_net_name = json_data[self.ACTOR_NET_KEY]
    critic_net_name = json_data[self.CRITIC_NET_KEY]
    actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data
                                   ) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]

    s_size = self.get_state_size()
    g_size = self.get_goal_size()
    a_size = self.get_action_size()

    # setup input tensors
    self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")  # observations
    self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")  # target value s
    self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage
    self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")  # target actions
    self.g_tf = tf.placeholder(tf.float32,
                               shape=([None, g_size] if self.has_goal() else None),
                               name="g")  # goals

    with tf.variable_scope('main'):
      with tf.variable_scope('actor'):
        self.actor_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
      with tf.variable_scope('critic'):
        self.critic_tf = self._build_net_critic(critic_net_name)

    if (self.actor_tf != None):
      Logger.print2('Built actor net: ' + actor_net_name)

    if (self.critic_tf != None):
      Logger.print2('Built critic net: ' + critic_net_name)

    return

  def _build_normalizers(self):
    super()._build_normalizers()
    with self.sess.as_default(), self.graph.as_default(), tf.variable_scope(self.tf_scope):
      with tf.variable_scope(self.RESOURCE_SCOPE):
        val_offset, val_scale = self._calc_val_offset_scale(self.discount)
        self.val_norm = TFNormalizer(self.sess, 'val_norm', 1)
        self.val_norm.set_mean_std(-val_offset, 1.0 / val_scale)
    return

  def _init_normalizers(self):
    super()._init_normalizers()
    with self.sess.as_default(), self.graph.as_default():
      self.val_norm.update()
    return

  def _load_normalizers(self):
    super()._load_normalizers()
    self.val_norm.load()
    return

  def _build_losses(self, json_data):
    actor_weight_decay = 0 if (
        self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
    critic_weight_decay = 0 if (
        self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]

    norm_val_diff = self.val_norm.normalize_tf(self.tar_val_tf) - self.val_norm.normalize_tf(
        self.critic_tf)
    self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))

    if (critic_weight_decay != 0):
      self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss('main/critic')

    norm_a_mean_tf = self.a_norm.normalize_tf(self.actor_tf)
    norm_a_diff = self.a_norm.normalize_tf(self.a_tf) - norm_a_mean_tf

    self.actor_loss_tf = tf.reduce_sum(tf.square(norm_a_diff), axis=-1)
    self.actor_loss_tf *= self.adv_tf
    self.actor_loss_tf = 0.5 * tf.reduce_mean(self.actor_loss_tf)

    norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
    norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
    a_bound_loss = TFUtil.calc_bound_loss(norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
    a_bound_loss /= self.exp_params_curr.noise
    self.actor_loss_tf += a_bound_loss

    if (actor_weight_decay != 0):
      self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss('main/actor')

    return

  def _build_solvers(self, json_data):
    actor_stepsize = 0.001 if (
        self.ACTOR_STEPSIZE_KEY not in json_data) else json_data[self.ACTOR_STEPSIZE_KEY]
    actor_momentum = 0.9 if (
        self.ACTOR_MOMENTUM_KEY not in json_data) else json_data[self.ACTOR_MOMENTUM_KEY]
    critic_stepsize = 0.01 if (
        self.CRITIC_STEPSIZE_KEY not in json_data) else json_data[self.CRITIC_STEPSIZE_KEY]
    critic_momentum = 0.9 if (
        self.CRITIC_MOMENTUM_KEY not in json_data) else json_data[self.CRITIC_MOMENTUM_KEY]

    critic_vars = self._tf_vars('main/critic')
    critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize,
                                            momentum=critic_momentum)
    self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
    self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)

    actor_vars = self._tf_vars('main/actor')
    actor_opt = tf.train.MomentumOptimizer(learning_rate=actor_stepsize, momentum=actor_momentum)
    self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
    self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)

    return

  def _build_net_actor(self, net_name, init_output_scale):
    norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
    input_tfs = [norm_s_tf]
    if (self.has_goal()):
      norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
      input_tfs += [norm_g_tf]

    h = NetBuilder.build_net(net_name, input_tfs)
    norm_a_tf = tf.layers.dense(inputs=h,
                                units=self.get_action_size(),
                                activation=None,
                                kernel_initializer=tf.random_uniform_initializer(
                                    minval=-init_output_scale, maxval=init_output_scale))

    a_tf = self.a_norm.unnormalize_tf(norm_a_tf)
    return a_tf

  def _build_net_critic(self, net_name):
    norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
    input_tfs = [norm_s_tf]
    if (self.has_goal()):
      norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
      input_tfs += [norm_g_tf]

    h = NetBuilder.build_net(net_name, input_tfs)
    norm_val_tf = tf.layers.dense(inputs=h,
                                  units=1,
                                  activation=None,
                                  kernel_initializer=TFUtil.xavier_initializer)

    norm_val_tf = tf.reshape(norm_val_tf, [-1])
    val_tf = self.val_norm.unnormalize_tf(norm_val_tf)
    return val_tf

  def _initialize_vars(self):
    super()._initialize_vars()
    self._sync_solvers()
    return

  def _sync_solvers(self):
    self.actor_solver.sync()
    self.critic_solver.sync()
    return

  def _decide_action(self, s, g):
    with self.sess.as_default(), self.graph.as_default():
      self._exp_action = False
      a = self._eval_actor(s, g)[0]
      logp = 0

      if self._enable_stoch_policy():
        # epsilon-greedy
        rand_action = MathUtil.flip_coin(self.exp_params_curr.rate)
        if rand_action:
          norm_exp_noise = np.random.randn(*a.shape)
          norm_exp_noise *= self.exp_params_curr.noise
          exp_noise = norm_exp_noise * self.a_norm.std
          a += exp_noise

          logp = self._calc_action_logp(norm_exp_noise)
          self._exp_action = True

    return a, logp

  def _enable_stoch_policy(self):
    return self.enable_training and (self._mode == self.Mode.TRAIN or
                                     self._mode == self.Mode.TRAIN_END)

  def _eval_actor(self, s, g):
    s = np.reshape(s, [-1, self.get_state_size()])
    g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None

    feed = {self.s_tf: s, self.g_tf: g}

    a = self.actor_tf.eval(feed)
    return a

  def _eval_critic(self, s, g):
    with self.sess.as_default(), self.graph.as_default():
      s = np.reshape(s, [-1, self.get_state_size()])
      g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None

      feed = {self.s_tf: s, self.g_tf: g}

      val = self.critic_tf.eval(feed)
    return val

  def _record_flags(self):
    flags = int(0)
    if (self._exp_action):
      flags = flags | self.EXP_ACTION_FLAG
    return flags

  def _train_step(self):
    super()._train_step()

    critic_loss = self._update_critic()
    actor_loss = self._update_actor()
    critic_loss = MPIUtil.reduce_avg(critic_loss)
    actor_loss = MPIUtil.reduce_avg(actor_loss)

    critic_stepsize = self.critic_solver.get_stepsize()
    actor_stepsize = self.actor_solver.get_stepsize()

    self.logger.log_tabular('Critic_Loss', critic_loss)
    self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
    self.logger.log_tabular('Actor_Loss', actor_loss)
    self.logger.log_tabular('Actor_Stepsize', actor_stepsize)

    return

  def _update_critic(self):
    idx = self.replay_buffer.sample(self._local_mini_batch_size)
    s = self.replay_buffer.get('states', idx)
    g = self.replay_buffer.get('goals', idx) if self.has_goal() else None

    tar_V = self._calc_updated_vals(idx)
    tar_V = np.clip(tar_V, self.val_min, self.val_max)

    feed = {self.s_tf: s, self.g_tf: g, self.tar_val_tf: tar_V}

    loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf], feed)
    self.critic_solver.update(grads)
    return loss

  def _update_actor(self):
    key = self.EXP_ACTION_FLAG
    idx = self.replay_buffer.sample_filtered(self._local_mini_batch_size, key)
    has_goal = self.has_goal()

    s = self.replay_buffer.get('states', idx)
    g = self.replay_buffer.get('goals', idx) if has_goal else None
    a = self.replay_buffer.get('actions', idx)

    V_new = self._calc_updated_vals(idx)
    V_old = self._eval_critic(s, g)
    adv = V_new - V_old

    feed = {self.s_tf: s, self.g_tf: g, self.a_tf: a, self.adv_tf: adv}

    loss, grads = self.sess.run([self.actor_loss_tf, self.actor_grad_tf], feed)
    self.actor_solver.update(grads)

    return loss

  def _calc_updated_vals(self, idx):
    r = self.replay_buffer.get('rewards', idx)

    if self.discount == 0:
      new_V = r
    else:
      next_idx = self.replay_buffer.get_next_idx(idx)
      s_next = self.replay_buffer.get('states', next_idx)
      g_next = self.replay_buffer.get('goals', next_idx) if self.has_goal() else None

      is_end = self.replay_buffer.is_path_end(idx)
      is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
      is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
      is_fail = np.logical_and(is_end, is_fail)
      is_succ = np.logical_and(is_end, is_succ)

      V_next = self._eval_critic(s_next, g_next)
      V_next[is_fail] = self.val_fail
      V_next[is_succ] = self.val_succ

      new_V = r + self.discount * V_next
    return new_V

  def _calc_action_logp(self, norm_action_deltas):
    # norm action delta are for the normalized actions (scaled by self.a_norm.std)
    stdev = self.exp_params_curr.noise
    assert stdev > 0

    a_size = self.get_action_size()
    logp = -0.5 / (stdev * stdev) * np.sum(np.square(norm_action_deltas), axis=-1)
    logp += -0.5 * a_size * np.log(2 * np.pi)
    logp += -a_size * np.log(stdev)
    return logp

  def _log_val(self, s, g):
    val = self._eval_critic(s, g)
    norm_val = self.val_norm.normalize(val)
    self.world.env.log_val(self.id, norm_val[0])
    return

  def _build_replay_buffer(self, buffer_size):
    super()._build_replay_buffer(buffer_size)
    self.replay_buffer.add_filter_key(self.EXP_ACTION_FLAG)
    return

View File

@@ -10,359 +10,374 @@ from pybullet_utils.logger import Logger
import pybullet_utils.mpi_util as MPIUtil
import pybullet_utils.math_util as MathUtil
from pybullet_envs.deep_mimic.env.env import Env
'''
Proximal Policy Optimization Agent
'''
class PPOAgent(PGAgent):
NAME = "PPO"
EPOCHS_KEY = "Epochs"
BATCH_SIZE_KEY = "BatchSize"
RATIO_CLIP_KEY = "RatioClip"
NORM_ADV_CLIP_KEY = "NormAdvClip"
TD_LAMBDA_KEY = "TDLambda"
TAR_CLIP_FRAC = "TarClipFrac"
ACTOR_STEPSIZE_DECAY = "ActorStepsizeDecay"
NAME = "PPO"
EPOCHS_KEY = "Epochs"
BATCH_SIZE_KEY = "BatchSize"
RATIO_CLIP_KEY = "RatioClip"
NORM_ADV_CLIP_KEY = "NormAdvClip"
TD_LAMBDA_KEY = "TDLambda"
TAR_CLIP_FRAC = "TarClipFrac"
ACTOR_STEPSIZE_DECAY = "ActorStepsizeDecay"
def __init__(self, world, id, json_data):
super().__init__(world, id, json_data)
return
def __init__(self, world, id, json_data):
super().__init__(world, id, json_data)
return
def _load_params(self, json_data):
super()._load_params(json_data)
def _load_params(self, json_data):
super()._load_params(json_data)
self.epochs = 1 if (self.EPOCHS_KEY not in json_data) else json_data[self.EPOCHS_KEY]
self.batch_size = 1024 if (self.BATCH_SIZE_KEY not in json_data) else json_data[self.BATCH_SIZE_KEY]
self.ratio_clip = 0.2 if (self.RATIO_CLIP_KEY not in json_data) else json_data[self.RATIO_CLIP_KEY]
self.norm_adv_clip = 5 if (self.NORM_ADV_CLIP_KEY not in json_data) else json_data[self.NORM_ADV_CLIP_KEY]
self.td_lambda = 0.95 if (self.TD_LAMBDA_KEY not in json_data) else json_data[self.TD_LAMBDA_KEY]
self.tar_clip_frac = -1 if (self.TAR_CLIP_FRAC not in json_data) else json_data[self.TAR_CLIP_FRAC]
self.actor_stepsize_decay = 0.5 if (self.ACTOR_STEPSIZE_DECAY not in json_data) else json_data[self.ACTOR_STEPSIZE_DECAY]
self.epochs = 1 if (self.EPOCHS_KEY not in json_data) else json_data[self.EPOCHS_KEY]
self.batch_size = 1024 if (
self.BATCH_SIZE_KEY not in json_data) else json_data[self.BATCH_SIZE_KEY]
self.ratio_clip = 0.2 if (
self.RATIO_CLIP_KEY not in json_data) else json_data[self.RATIO_CLIP_KEY]
self.norm_adv_clip = 5 if (
self.NORM_ADV_CLIP_KEY not in json_data) else json_data[self.NORM_ADV_CLIP_KEY]
self.td_lambda = 0.95 if (
self.TD_LAMBDA_KEY not in json_data) else json_data[self.TD_LAMBDA_KEY]
self.tar_clip_frac = -1 if (
self.TAR_CLIP_FRAC not in json_data) else json_data[self.TAR_CLIP_FRAC]
self.actor_stepsize_decay = 0.5 if (
self.ACTOR_STEPSIZE_DECAY not in json_data) else json_data[self.ACTOR_STEPSIZE_DECAY]
num_procs = MPIUtil.get_num_procs()
local_batch_size = int(self.batch_size / num_procs)
min_replay_size = 2 * local_batch_size # needed to prevent buffer overflow
assert(self.replay_buffer_size > min_replay_size)
num_procs = MPIUtil.get_num_procs()
local_batch_size = int(self.batch_size / num_procs)
min_replay_size = 2 * local_batch_size # needed to prevent buffer overflow
assert (self.replay_buffer_size > min_replay_size)
self.replay_buffer_size = np.maximum(min_replay_size, self.replay_buffer_size)
self.replay_buffer_size = np.maximum(min_replay_size, self.replay_buffer_size)
return
return
def _build_nets(self, json_data):
assert self.ACTOR_NET_KEY in json_data
assert self.CRITIC_NET_KEY in json_data
def _build_nets(self, json_data):
assert self.ACTOR_NET_KEY in json_data
assert self.CRITIC_NET_KEY in json_data
actor_net_name = json_data[self.ACTOR_NET_KEY]
critic_net_name = json_data[self.CRITIC_NET_KEY]
actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
actor_net_name = json_data[self.ACTOR_NET_KEY]
critic_net_name = json_data[self.CRITIC_NET_KEY]
actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data
) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
s_size = self.get_state_size()
g_size = self.get_goal_size()
a_size = self.get_action_size()
s_size = self.get_state_size()
g_size = self.get_goal_size()
a_size = self.get_action_size()
# setup input tensors
self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")
self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")
self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")
self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")
self.g_tf = tf.placeholder(tf.float32, shape=([None, g_size] if self.has_goal() else None), name="g")
self.old_logp_tf = tf.placeholder(tf.float32, shape=[None], name="old_logp")
self.exp_mask_tf = tf.placeholder(tf.float32, shape=[None], name="exp_mask")
# setup input tensors
self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")
self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")
self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")
self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")
self.g_tf = tf.placeholder(tf.float32,
shape=([None, g_size] if self.has_goal() else None),
name="g")
self.old_logp_tf = tf.placeholder(tf.float32, shape=[None], name="old_logp")
self.exp_mask_tf = tf.placeholder(tf.float32, shape=[None], name="exp_mask")
with tf.variable_scope('main'):
with tf.variable_scope('actor'):
self.a_mean_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
with tf.variable_scope('critic'):
self.critic_tf = self._build_net_critic(critic_net_name)
if (self.a_mean_tf != None):
Logger.print2('Built actor net: ' + actor_net_name)
with tf.variable_scope('main'):
with tf.variable_scope('actor'):
self.a_mean_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
with tf.variable_scope('critic'):
self.critic_tf = self._build_net_critic(critic_net_name)
if (self.critic_tf != None):
Logger.print2('Built critic net: ' + critic_net_name)
self.norm_a_std_tf = self.exp_params_curr.noise * tf.ones(a_size)
norm_a_noise_tf = self.norm_a_std_tf * tf.random_normal(shape=tf.shape(self.a_mean_tf))
norm_a_noise_tf *= tf.expand_dims(self.exp_mask_tf, axis=-1)
self.sample_a_tf = self.a_mean_tf + norm_a_noise_tf * self.a_norm.std_tf
self.sample_a_logp_tf = TFUtil.calc_logp_gaussian(x_tf=norm_a_noise_tf, mean_tf=None, std_tf=self.norm_a_std_tf)
if (self.a_mean_tf != None):
Logger.print2('Built actor net: ' + actor_net_name)
return
if (self.critic_tf != None):
Logger.print2('Built critic net: ' + critic_net_name)
def _build_losses(self, json_data):
actor_weight_decay = 0 if (self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
critic_weight_decay = 0 if (self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]
norm_val_diff = self.val_norm.normalize_tf(self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf)
self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))
self.norm_a_std_tf = self.exp_params_curr.noise * tf.ones(a_size)
norm_a_noise_tf = self.norm_a_std_tf * tf.random_normal(shape=tf.shape(self.a_mean_tf))
norm_a_noise_tf *= tf.expand_dims(self.exp_mask_tf, axis=-1)
self.sample_a_tf = self.a_mean_tf + norm_a_noise_tf * self.a_norm.std_tf
self.sample_a_logp_tf = TFUtil.calc_logp_gaussian(x_tf=norm_a_noise_tf,
mean_tf=None,
std_tf=self.norm_a_std_tf)
if (critic_weight_decay != 0):
self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss('main/critic')
norm_tar_a_tf = self.a_norm.normalize_tf(self.a_tf)
self._norm_a_mean_tf = self.a_norm.normalize_tf(self.a_mean_tf)
return
self.logp_tf = TFUtil.calc_logp_gaussian(norm_tar_a_tf, self._norm_a_mean_tf, self.norm_a_std_tf)
ratio_tf = tf.exp(self.logp_tf - self.old_logp_tf)
actor_loss0 = self.adv_tf * ratio_tf
actor_loss1 = self.adv_tf * tf.clip_by_value(ratio_tf, 1.0 - self.ratio_clip, 1 + self.ratio_clip)
self.actor_loss_tf = -tf.reduce_mean(tf.minimum(actor_loss0, actor_loss1))
def _build_losses(self, json_data):
actor_weight_decay = 0 if (
self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
critic_weight_decay = 0 if (
self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]
norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
a_bound_loss = TFUtil.calc_bound_loss(self._norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
self.actor_loss_tf += a_bound_loss
norm_val_diff = self.val_norm.normalize_tf(self.tar_val_tf) - self.val_norm.normalize_tf(
self.critic_tf)
self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))
if (actor_weight_decay != 0):
self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss('main/actor')
# for debugging
self.clip_frac_tf = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio_tf - 1.0), self.ratio_clip)))
if (critic_weight_decay != 0):
self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss('main/critic')
return
norm_tar_a_tf = self.a_norm.normalize_tf(self.a_tf)
self._norm_a_mean_tf = self.a_norm.normalize_tf(self.a_mean_tf)
def _build_solvers(self, json_data):
actor_stepsize = 0.001 if (self.ACTOR_STEPSIZE_KEY not in json_data) else json_data[self.ACTOR_STEPSIZE_KEY]
actor_momentum = 0.9 if (self.ACTOR_MOMENTUM_KEY not in json_data) else json_data[self.ACTOR_MOMENTUM_KEY]
critic_stepsize = 0.01 if (self.CRITIC_STEPSIZE_KEY not in json_data) else json_data[self.CRITIC_STEPSIZE_KEY]
critic_momentum = 0.9 if (self.CRITIC_MOMENTUM_KEY not in json_data) else json_data[self.CRITIC_MOMENTUM_KEY]
critic_vars = self._tf_vars('main/critic')
critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize, momentum=critic_momentum)
self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)
self.logp_tf = TFUtil.calc_logp_gaussian(norm_tar_a_tf, self._norm_a_mean_tf,
self.norm_a_std_tf)
ratio_tf = tf.exp(self.logp_tf - self.old_logp_tf)
actor_loss0 = self.adv_tf * ratio_tf
actor_loss1 = self.adv_tf * tf.clip_by_value(ratio_tf, 1.0 - self.ratio_clip,
1 + self.ratio_clip)
self.actor_loss_tf = -tf.reduce_mean(tf.minimum(actor_loss0, actor_loss1))
self._actor_stepsize_tf = tf.get_variable(dtype=tf.float32, name='actor_stepsize', initializer=actor_stepsize, trainable=False)
self._actor_stepsize_ph = tf.get_variable(dtype=tf.float32, name='actor_stepsize_ph', shape=[])
self._actor_stepsize_update_op = self._actor_stepsize_tf.assign(self._actor_stepsize_ph)
norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
a_bound_loss = TFUtil.calc_bound_loss(self._norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
self.actor_loss_tf += a_bound_loss
actor_vars = self._tf_vars('main/actor')
actor_opt = tf.train.MomentumOptimizer(learning_rate=self._actor_stepsize_tf, momentum=actor_momentum)
self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)
return
if (actor_weight_decay != 0):
self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss('main/actor')
def _decide_action(self, s, g):
with self.sess.as_default(), self.graph.as_default():
self._exp_action = self._enable_stoch_policy() and MathUtil.flip_coin(self.exp_params_curr.rate)
#print("_decide_action._exp_action=",self._exp_action)
a, logp = self._eval_actor(s, g, self._exp_action)
return a[0], logp[0]
# for debugging
self.clip_frac_tf = tf.reduce_mean(
tf.to_float(tf.greater(tf.abs(ratio_tf - 1.0), self.ratio_clip)))
def _eval_actor(self, s, g, enable_exp):
s = np.reshape(s, [-1, self.get_state_size()])
g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None
feed = {
self.s_tf : s,
self.g_tf : g,
self.exp_mask_tf: np.array([1 if enable_exp else 0])
}
return
a, logp = self.sess.run([self.sample_a_tf, self.sample_a_logp_tf], feed_dict=feed)
return a, logp
def _build_solvers(self, json_data):
actor_stepsize = 0.001 if (
self.ACTOR_STEPSIZE_KEY not in json_data) else json_data[self.ACTOR_STEPSIZE_KEY]
actor_momentum = 0.9 if (
self.ACTOR_MOMENTUM_KEY not in json_data) else json_data[self.ACTOR_MOMENTUM_KEY]
critic_stepsize = 0.01 if (
self.CRITIC_STEPSIZE_KEY not in json_data) else json_data[self.CRITIC_STEPSIZE_KEY]
critic_momentum = 0.9 if (
self.CRITIC_MOMENTUM_KEY not in json_data) else json_data[self.CRITIC_MOMENTUM_KEY]
def _train_step(self):
adv_eps = 1e-5
critic_vars = self._tf_vars('main/critic')
critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize,
momentum=critic_momentum)
self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)
start_idx = self.replay_buffer.buffer_tail
end_idx = self.replay_buffer.buffer_head
assert(start_idx == 0)
assert(self.replay_buffer.get_current_size() <= self.replay_buffer.buffer_size) # must avoid overflow
assert(start_idx < end_idx)
self._actor_stepsize_tf = tf.get_variable(dtype=tf.float32,
name='actor_stepsize',
initializer=actor_stepsize,
trainable=False)
self._actor_stepsize_ph = tf.get_variable(dtype=tf.float32, name='actor_stepsize_ph', shape=[])
self._actor_stepsize_update_op = self._actor_stepsize_tf.assign(self._actor_stepsize_ph)
idx = np.array(list(range(start_idx, end_idx)))
end_mask = self.replay_buffer.is_path_end(idx)
end_mask = np.logical_not(end_mask)
vals = self._compute_batch_vals(start_idx, end_idx)
new_vals = self._compute_batch_new_vals(start_idx, end_idx, vals)
actor_vars = self._tf_vars('main/actor')
actor_opt = tf.train.MomentumOptimizer(learning_rate=self._actor_stepsize_tf,
momentum=actor_momentum)
self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)
valid_idx = idx[end_mask]
exp_idx = self.replay_buffer.get_idx_filtered(self.EXP_ACTION_FLAG).copy()
num_valid_idx = valid_idx.shape[0]
num_exp_idx = exp_idx.shape[0]
exp_idx = np.column_stack([exp_idx, np.array(list(range(0, num_exp_idx)), dtype=np.int32)])
local_sample_count = valid_idx.size
global_sample_count = int(MPIUtil.reduce_sum(local_sample_count))
mini_batches = int(np.ceil(global_sample_count / self.mini_batch_size))
adv = new_vals[exp_idx[:,0]] - vals[exp_idx[:,0]]
new_vals = np.clip(new_vals, self.val_min, self.val_max)
return
adv_mean = np.mean(adv)
adv_std = np.std(adv)
adv = (adv - adv_mean) / (adv_std + adv_eps)
adv = np.clip(adv, -self.norm_adv_clip, self.norm_adv_clip)
def _decide_action(self, s, g):
with self.sess.as_default(), self.graph.as_default():
self._exp_action = self._enable_stoch_policy() and MathUtil.flip_coin(
self.exp_params_curr.rate)
#print("_decide_action._exp_action=",self._exp_action)
a, logp = self._eval_actor(s, g, self._exp_action)
return a[0], logp[0]
critic_loss = 0
actor_loss = 0
actor_clip_frac = 0
def _eval_actor(self, s, g, enable_exp):
s = np.reshape(s, [-1, self.get_state_size()])
g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None
for e in range(self.epochs):
np.random.shuffle(valid_idx)
np.random.shuffle(exp_idx)
feed = {self.s_tf: s, self.g_tf: g, self.exp_mask_tf: np.array([1 if enable_exp else 0])}
for b in range(mini_batches):
batch_idx_beg = b * self._local_mini_batch_size
batch_idx_end = batch_idx_beg + self._local_mini_batch_size
a, logp = self.sess.run([self.sample_a_tf, self.sample_a_logp_tf], feed_dict=feed)
return a, logp
critic_batch = np.array(range(batch_idx_beg, batch_idx_end), dtype=np.int32)
actor_batch = critic_batch.copy()
critic_batch = np.mod(critic_batch, num_valid_idx)
actor_batch = np.mod(actor_batch, num_exp_idx)
shuffle_actor = (actor_batch[-1] < actor_batch[0]) or (actor_batch[-1] == num_exp_idx - 1)
def _train_step(self):
adv_eps = 1e-5
critic_batch = valid_idx[critic_batch]
actor_batch = exp_idx[actor_batch]
critic_batch_vals = new_vals[critic_batch]
actor_batch_adv = adv[actor_batch[:,1]]
start_idx = self.replay_buffer.buffer_tail
end_idx = self.replay_buffer.buffer_head
assert (start_idx == 0)
assert (self.replay_buffer.get_current_size() <= self.replay_buffer.buffer_size
) # must avoid overflow
assert (start_idx < end_idx)
critic_s = self.replay_buffer.get('states', critic_batch)
critic_g = self.replay_buffer.get('goals', critic_batch) if self.has_goal() else None
curr_critic_loss = self._update_critic(critic_s, critic_g, critic_batch_vals)
idx = np.array(list(range(start_idx, end_idx)))
end_mask = self.replay_buffer.is_path_end(idx)
end_mask = np.logical_not(end_mask)
actor_s = self.replay_buffer.get("states", actor_batch[:,0])
actor_g = self.replay_buffer.get("goals", actor_batch[:,0]) if self.has_goal() else None
actor_a = self.replay_buffer.get("actions", actor_batch[:,0])
actor_logp = self.replay_buffer.get("logps", actor_batch[:,0])
curr_actor_loss, curr_actor_clip_frac = self._update_actor(actor_s, actor_g, actor_a, actor_logp, actor_batch_adv)
critic_loss += curr_critic_loss
actor_loss += np.abs(curr_actor_loss)
actor_clip_frac += curr_actor_clip_frac
vals = self._compute_batch_vals(start_idx, end_idx)
new_vals = self._compute_batch_new_vals(start_idx, end_idx, vals)
if (shuffle_actor):
np.random.shuffle(exp_idx)
valid_idx = idx[end_mask]
exp_idx = self.replay_buffer.get_idx_filtered(self.EXP_ACTION_FLAG).copy()
num_valid_idx = valid_idx.shape[0]
num_exp_idx = exp_idx.shape[0]
exp_idx = np.column_stack([exp_idx, np.array(list(range(0, num_exp_idx)), dtype=np.int32)])
total_batches = mini_batches * self.epochs
critic_loss /= total_batches
actor_loss /= total_batches
actor_clip_frac /= total_batches
local_sample_count = valid_idx.size
global_sample_count = int(MPIUtil.reduce_sum(local_sample_count))
mini_batches = int(np.ceil(global_sample_count / self.mini_batch_size))
critic_loss = MPIUtil.reduce_avg(critic_loss)
actor_loss = MPIUtil.reduce_avg(actor_loss)
actor_clip_frac = MPIUtil.reduce_avg(actor_clip_frac)
adv = new_vals[exp_idx[:, 0]] - vals[exp_idx[:, 0]]
new_vals = np.clip(new_vals, self.val_min, self.val_max)
critic_stepsize = self.critic_solver.get_stepsize()
actor_stepsize = self.update_actor_stepsize(actor_clip_frac)
adv_mean = np.mean(adv)
adv_std = np.std(adv)
adv = (adv - adv_mean) / (adv_std + adv_eps)
adv = np.clip(adv, -self.norm_adv_clip, self.norm_adv_clip)
self.logger.log_tabular('Critic_Loss', critic_loss)
self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
self.logger.log_tabular('Actor_Loss', actor_loss)
self.logger.log_tabular('Actor_Stepsize', actor_stepsize)
self.logger.log_tabular('Clip_Frac', actor_clip_frac)
self.logger.log_tabular('Adv_Mean', adv_mean)
self.logger.log_tabular('Adv_Std', adv_std)
critic_loss = 0
actor_loss = 0
actor_clip_frac = 0
self.replay_buffer.clear()
for e in range(self.epochs):
np.random.shuffle(valid_idx)
np.random.shuffle(exp_idx)
return
for b in range(mini_batches):
batch_idx_beg = b * self._local_mini_batch_size
batch_idx_end = batch_idx_beg + self._local_mini_batch_size
def _get_iters_per_update(self):
return 1
critic_batch = np.array(range(batch_idx_beg, batch_idx_end), dtype=np.int32)
actor_batch = critic_batch.copy()
critic_batch = np.mod(critic_batch, num_valid_idx)
actor_batch = np.mod(actor_batch, num_exp_idx)
shuffle_actor = (actor_batch[-1] < actor_batch[0]) or (actor_batch[-1] == num_exp_idx - 1)
def _valid_train_step(self):
samples = self.replay_buffer.get_current_size()
exp_samples = self.replay_buffer.count_filtered(self.EXP_ACTION_FLAG)
global_sample_count = int(MPIUtil.reduce_sum(samples))
global_exp_min = int(MPIUtil.reduce_min(exp_samples))
return (global_sample_count > self.batch_size) and (global_exp_min > 0)
critic_batch = valid_idx[critic_batch]
actor_batch = exp_idx[actor_batch]
critic_batch_vals = new_vals[critic_batch]
actor_batch_adv = adv[actor_batch[:, 1]]
def _compute_batch_vals(self, start_idx, end_idx):
states = self.replay_buffer.get_all("states")[start_idx:end_idx]
goals = self.replay_buffer.get_all("goals")[start_idx:end_idx] if self.has_goal() else None
idx = np.array(list(range(start_idx, end_idx)))
is_end = self.replay_buffer.is_path_end(idx)
is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
is_fail = np.logical_and(is_end, is_fail)
is_succ = np.logical_and(is_end, is_succ)
critic_s = self.replay_buffer.get('states', critic_batch)
critic_g = self.replay_buffer.get('goals', critic_batch) if self.has_goal() else None
curr_critic_loss = self._update_critic(critic_s, critic_g, critic_batch_vals)
vals = self._eval_critic(states, goals)
vals[is_fail] = self.val_fail
vals[is_succ] = self.val_succ
actor_s = self.replay_buffer.get("states", actor_batch[:, 0])
actor_g = self.replay_buffer.get("goals", actor_batch[:, 0]) if self.has_goal() else None
actor_a = self.replay_buffer.get("actions", actor_batch[:, 0])
actor_logp = self.replay_buffer.get("logps", actor_batch[:, 0])
curr_actor_loss, curr_actor_clip_frac = self._update_actor(actor_s, actor_g, actor_a,
actor_logp, actor_batch_adv)
return vals
critic_loss += curr_critic_loss
actor_loss += np.abs(curr_actor_loss)
actor_clip_frac += curr_actor_clip_frac
def _compute_batch_new_vals(self, start_idx, end_idx, val_buffer):
rewards = self.replay_buffer.get_all("rewards")[start_idx:end_idx]
if (shuffle_actor):
np.random.shuffle(exp_idx)
if self.discount == 0:
new_vals = rewards.copy()
total_batches = mini_batches * self.epochs
critic_loss /= total_batches
actor_loss /= total_batches
actor_clip_frac /= total_batches
critic_loss = MPIUtil.reduce_avg(critic_loss)
actor_loss = MPIUtil.reduce_avg(actor_loss)
actor_clip_frac = MPIUtil.reduce_avg(actor_clip_frac)
critic_stepsize = self.critic_solver.get_stepsize()
actor_stepsize = self.update_actor_stepsize(actor_clip_frac)
self.logger.log_tabular('Critic_Loss', critic_loss)
self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
self.logger.log_tabular('Actor_Loss', actor_loss)
self.logger.log_tabular('Actor_Stepsize', actor_stepsize)
self.logger.log_tabular('Clip_Frac', actor_clip_frac)
self.logger.log_tabular('Adv_Mean', adv_mean)
self.logger.log_tabular('Adv_Std', adv_std)
self.replay_buffer.clear()
return
def _get_iters_per_update(self):
return 1
def _valid_train_step(self):
samples = self.replay_buffer.get_current_size()
exp_samples = self.replay_buffer.count_filtered(self.EXP_ACTION_FLAG)
global_sample_count = int(MPIUtil.reduce_sum(samples))
global_exp_min = int(MPIUtil.reduce_min(exp_samples))
return (global_sample_count > self.batch_size) and (global_exp_min > 0)
def _compute_batch_vals(self, start_idx, end_idx):
states = self.replay_buffer.get_all("states")[start_idx:end_idx]
goals = self.replay_buffer.get_all("goals")[start_idx:end_idx] if self.has_goal() else None
idx = np.array(list(range(start_idx, end_idx)))
is_end = self.replay_buffer.is_path_end(idx)
is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
is_fail = np.logical_and(is_end, is_fail)
is_succ = np.logical_and(is_end, is_succ)
vals = self._eval_critic(states, goals)
vals[is_fail] = self.val_fail
vals[is_succ] = self.val_succ
return vals
def _compute_batch_new_vals(self, start_idx, end_idx, val_buffer):
rewards = self.replay_buffer.get_all("rewards")[start_idx:end_idx]
if self.discount == 0:
new_vals = rewards.copy()
else:
new_vals = np.zeros_like(val_buffer)
curr_idx = start_idx
while curr_idx < end_idx:
idx0 = curr_idx - start_idx
idx1 = self.replay_buffer.get_path_end(curr_idx) - start_idx
r = rewards[idx0:idx1]
v = val_buffer[idx0:(idx1 + 1)]
new_vals[idx0:idx1] = RLUtil.compute_return(r, self.discount, self.td_lambda, v)
curr_idx = idx1 + start_idx + 1
return new_vals
def _update_critic(self, s, g, tar_vals):
feed = {self.s_tf: s, self.g_tf: g, self.tar_val_tf: tar_vals}
loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf], feed)
self.critic_solver.update(grads)
return loss
def _update_actor(self, s, g, a, logp, adv):
feed = {self.s_tf: s, self.g_tf: g, self.a_tf: a, self.adv_tf: adv, self.old_logp_tf: logp}
loss, grads, clip_frac = self.sess.run(
[self.actor_loss_tf, self.actor_grad_tf, self.clip_frac_tf], feed)
self.actor_solver.update(grads)
return loss, clip_frac
def update_actor_stepsize(self, clip_frac):
clip_tol = 1.5
step_scale = 2
max_stepsize = 1e-2
min_stepsize = 1e-8
warmup_iters = 5
actor_stepsize = self.actor_solver.get_stepsize()
if (self.tar_clip_frac >= 0 and self.iter > warmup_iters):
min_clip = self.tar_clip_frac / clip_tol
max_clip = self.tar_clip_frac * clip_tol
under_tol = clip_frac < min_clip
over_tol = clip_frac > max_clip
if (over_tol or under_tol):
if (over_tol):
actor_stepsize *= self.actor_stepsize_decay
else:
new_vals = np.zeros_like(val_buffer)
actor_stepsize /= self.actor_stepsize_decay
curr_idx = start_idx
while curr_idx < end_idx:
idx0 = curr_idx - start_idx
idx1 = self.replay_buffer.get_path_end(curr_idx) - start_idx
r = rewards[idx0:idx1]
v = val_buffer[idx0:(idx1 + 1)]
actor_stepsize = np.clip(actor_stepsize, min_stepsize, max_stepsize)
self.set_actor_stepsize(actor_stepsize)
new_vals[idx0:idx1] = RLUtil.compute_return(r, self.discount, self.td_lambda, v)
curr_idx = idx1 + start_idx + 1
return new_vals
return actor_stepsize
def _update_critic(self, s, g, tar_vals):
feed = {
self.s_tf: s,
self.g_tf: g,
self.tar_val_tf: tar_vals
}
loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf], feed)
self.critic_solver.update(grads)
return loss
def _update_actor(self, s, g, a, logp, adv):
feed = {
self.s_tf: s,
self.g_tf: g,
self.a_tf: a,
self.adv_tf: adv,
self.old_logp_tf: logp
}
loss, grads, clip_frac = self.sess.run([self.actor_loss_tf, self.actor_grad_tf,
self.clip_frac_tf], feed)
self.actor_solver.update(grads)
return loss, clip_frac
def update_actor_stepsize(self, clip_frac):
clip_tol = 1.5
step_scale = 2
max_stepsize = 1e-2
min_stepsize = 1e-8
warmup_iters = 5
actor_stepsize = self.actor_solver.get_stepsize()
if (self.tar_clip_frac >= 0 and self.iter > warmup_iters):
min_clip = self.tar_clip_frac / clip_tol
max_clip = self.tar_clip_frac * clip_tol
under_tol = clip_frac < min_clip
over_tol = clip_frac > max_clip
if (over_tol or under_tol):
if (over_tol):
actor_stepsize *= self.actor_stepsize_decay
else:
actor_stepsize /= self.actor_stepsize_decay
actor_stepsize = np.clip(actor_stepsize, min_stepsize, max_stepsize)
self.set_actor_stepsize(actor_stepsize)
return actor_stepsize
def set_actor_stepsize(self, stepsize):
feed = {
self._actor_stepsize_ph: stepsize,
}
self.sess.run(self._actor_stepsize_update_op, feed)
return
def set_actor_stepsize(self, stepsize):
feed = {
self._actor_stepsize_ph: stepsize,
}
self.sess.run(self._actor_stepsize_update_op, feed)
return
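
The actor stepsize update above is driven by the PPO clip fraction: shrink the stepsize when too many samples hit the clip range, grow it when too few do. A minimal standalone sketch of that rule, using an illustrative decay factor and target clip fraction (not values taken from any agent config):

import numpy as np


def adapt_stepsize(stepsize, clip_frac, tar_clip_frac=0.25, decay=0.5,
                   clip_tol=1.5, min_stepsize=1e-8, max_stepsize=1e-2):
  # shrink the stepsize when too many samples are clipped, grow it when too few are
  if clip_frac > tar_clip_frac * clip_tol:
    stepsize *= decay
  elif clip_frac < tar_clip_frac / clip_tol:
    stepsize /= decay
  return float(np.clip(stepsize, min_stepsize, max_stepsize))


print(adapt_stepsize(1e-3, clip_frac=0.6))   # over tolerance  -> 5e-4
print(adapt_stepsize(1e-3, clip_frac=0.05))  # under tolerance -> 2e-3
print(adapt_stepsize(1e-3, clip_frac=0.3))   # within tolerance -> unchanged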

View File

@@ -5,347 +5,353 @@ import inspect as inspect
from pybullet_envs.deep_mimic.env.env import Env
import pybullet_utils.math_util as MathUtil
class ReplayBuffer(object):
  TERMINATE_KEY = 'terminate'
  PATH_START_KEY = 'path_start'
  PATH_END_KEY = 'path_end'

  def __init__(self, buffer_size):
    assert buffer_size > 0

    self.buffer_size = buffer_size
    self.total_count = 0
    self.buffer_head = 0
    self.buffer_tail = MathUtil.INVALID_IDX
    self.num_paths = 0
    self._sample_buffers = dict()
    self.buffers = None

    self.clear()
    return

  def sample(self, n):
    curr_size = self.get_current_size()
    assert curr_size > 0

    idx = np.empty(n, dtype=int)
    # makes sure that the end states are not sampled
    for i in range(n):
      while True:
        curr_idx = np.random.randint(0, curr_size, size=1)[0]
        curr_idx += self.buffer_tail
        curr_idx = np.mod(curr_idx, self.buffer_size)

        if not self.is_path_end(curr_idx):
          break
      idx[i] = curr_idx

    return idx

  def sample_filtered(self, n, key):
    assert key in self._sample_buffers
    curr_buffer = self._sample_buffers[key]
    idx = curr_buffer.sample(n)
    return idx

  def count_filtered(self, key):
    curr_buffer = self._sample_buffers[key]
    return curr_buffer.count

  def get(self, key, idx):
    return self.buffers[key][idx]

  def get_all(self, key):
    return self.buffers[key]

  def get_idx_filtered(self, key):
    assert key in self._sample_buffers
    curr_buffer = self._sample_buffers[key]
    idx = curr_buffer.slot_to_idx[:curr_buffer.count]
    return idx

  def get_path_start(self, idx):
    return self.buffers[self.PATH_START_KEY][idx]

  def get_path_end(self, idx):
    return self.buffers[self.PATH_END_KEY][idx]

  def get_pathlen(self, idx):
    is_array = isinstance(idx, np.ndarray) or isinstance(idx, list)
    if not is_array:
      idx = [idx]

    n = len(idx)
    start_idx = self.get_path_start(idx)
    end_idx = self.get_path_end(idx)
    pathlen = np.empty(n, dtype=int)

    for i in range(n):
      curr_start = start_idx[i]
      curr_end = end_idx[i]
      if curr_start < curr_end:
        curr_len = curr_end - curr_start
      else:
        curr_len = self.buffer_size - curr_start + curr_end
      pathlen[i] = curr_len

    if not is_array:
      pathlen = pathlen[0]

    return pathlen

  def is_valid_path(self, idx):
    start_idx = self.get_path_start(idx)
    valid = start_idx != MathUtil.INVALID_IDX
    return valid

  def store(self, path):
    start_idx = MathUtil.INVALID_IDX
    n = path.pathlength()
    if (n > 0):
      assert path.is_valid()

      if path.check_vals():
        if self.buffers is None:
          self._init_buffers(path)

        idx = self._request_idx(n + 1)
        self._store_path(path, idx)
        self._add_sample_buffers(idx)

        self.num_paths += 1
        self.total_count += n + 1
        start_idx = idx[0]
      else:
        Logger.print2('Invalid path data value detected')

    return start_idx

  def clear(self):
    self.buffer_head = 0
    self.buffer_tail = MathUtil.INVALID_IDX
    self.num_paths = 0

    for key in self._sample_buffers:
      self._sample_buffers[key].clear()
    return

  def get_next_idx(self, idx):
    next_idx = np.mod(idx + 1, self.buffer_size)
    return next_idx

  def is_terminal_state(self, idx):
    terminate_flags = self.buffers[self.TERMINATE_KEY][idx]
    terminate = terminate_flags != Env.Terminate.Null.value
    is_end = self.is_path_end(idx)
    terminal_state = np.logical_and(terminate, is_end)
    return terminal_state

  def check_terminal_flag(self, idx, flag):
    terminate_flags = self.buffers[self.TERMINATE_KEY][idx]
    terminate = terminate_flags == flag.value
    return terminate

  def is_path_end(self, idx):
    is_end = self.buffers[self.PATH_END_KEY][idx] == idx
    return is_end

  def add_filter_key(self, key):
    assert self.get_current_size() == 0
    if key not in self._sample_buffers:
      self._sample_buffers[key] = SampleBuffer(self.buffer_size)
    return

  def get_current_size(self):
    if self.buffer_tail == MathUtil.INVALID_IDX:
      return 0
    elif self.buffer_tail < self.buffer_head:
      return self.buffer_head - self.buffer_tail
    else:
      return self.buffer_size - self.buffer_tail + self.buffer_head

  def _check_flags(self, key, flags):
    return (flags & key) == key

  def _add_sample_buffers(self, idx):
    flags = self.buffers['flags']
    for key in self._sample_buffers:
      curr_buffer = self._sample_buffers[key]
      filter_idx = [
          i for i in idx if (self._check_flags(key, flags[i]) and not self.is_path_end(i))
      ]
      curr_buffer.add(filter_idx)
    return

  def _free_sample_buffers(self, idx):
    for key in self._sample_buffers:
      curr_buffer = self._sample_buffers[key]
      curr_buffer.free(idx)
    return

  def _init_buffers(self, path):
    self.buffers = dict()
    self.buffers[self.PATH_START_KEY] = MathUtil.INVALID_IDX * np.ones(self.buffer_size, dtype=int)
    self.buffers[self.PATH_END_KEY] = MathUtil.INVALID_IDX * np.ones(self.buffer_size, dtype=int)

    for key in dir(path):
      val = getattr(path, key)
      if not key.startswith('__') and not inspect.ismethod(val):
        if key == self.TERMINATE_KEY:
          self.buffers[self.TERMINATE_KEY] = np.zeros(shape=[self.buffer_size], dtype=int)
        else:
          val_type = type(val[0])
          is_array = val_type == np.ndarray
          if is_array:
            shape = [self.buffer_size, val[0].shape[0]]
            dtype = val[0].dtype
          else:
            shape = [self.buffer_size]
            dtype = val_type

          self.buffers[key] = np.zeros(shape, dtype=dtype)
    return

  def _request_idx(self, n):
    assert n + 1 < self.buffer_size  # bad things can happen if path is too long

    remainder = n
    idx = []

    start_idx = self.buffer_head
    while remainder > 0:
      end_idx = np.minimum(start_idx + remainder, self.buffer_size)
      remainder -= (end_idx - start_idx)

      free_idx = list(range(start_idx, end_idx))
      self._free_idx(free_idx)
      idx += free_idx
      start_idx = 0

    self.buffer_head = (self.buffer_head + n) % self.buffer_size
    return idx

  def _free_idx(self, idx):
    assert (idx[0] <= idx[-1])
    n = len(idx)
    if self.buffer_tail != MathUtil.INVALID_IDX:
      update_tail = idx[0] <= idx[-1] and idx[0] <= self.buffer_tail and idx[-1] >= self.buffer_tail
      update_tail |= idx[0] > idx[-1] and (idx[0] <= self.buffer_tail or
                                           idx[-1] >= self.buffer_tail)

      if update_tail:
        i = 0
        while i < n:
          curr_idx = idx[i]
          if self.is_valid_path(curr_idx):
            start_idx = self.get_path_start(curr_idx)
            end_idx = self.get_path_end(curr_idx)
            pathlen = self.get_pathlen(curr_idx)

            if start_idx < end_idx:
              self.buffers[self.PATH_START_KEY][start_idx:end_idx + 1] = MathUtil.INVALID_IDX
              self._free_sample_buffers(list(range(start_idx, end_idx + 1)))
            else:
              self.buffers[self.PATH_START_KEY][start_idx:self.buffer_size] = MathUtil.INVALID_IDX
              self.buffers[self.PATH_START_KEY][0:end_idx + 1] = MathUtil.INVALID_IDX
              self._free_sample_buffers(list(range(start_idx, self.buffer_size)))
              self._free_sample_buffers(list(range(0, end_idx + 1)))

            self.num_paths -= 1
            i += pathlen + 1
            self.buffer_tail = (end_idx + 1) % self.buffer_size
          else:
            i += 1
    else:
      self.buffer_tail = idx[0]
    return

  def _store_path(self, path, idx):
    n = path.pathlength()
    for key, data in self.buffers.items():
      if key != self.PATH_START_KEY and key != self.PATH_END_KEY and key != self.TERMINATE_KEY:
        val = getattr(path, key)
        val_len = len(val)
        assert val_len == n or val_len == n + 1
        data[idx[:val_len]] = val

    self.buffers[self.TERMINATE_KEY][idx] = path.terminate.value
    self.buffers[self.PATH_START_KEY][idx] = idx[0]
    self.buffers[self.PATH_END_KEY][idx] = idx[-1]
    return


class SampleBuffer(object):

  def __init__(self, size):
    self.idx_to_slot = np.empty(shape=[size], dtype=int)
    self.slot_to_idx = np.empty(shape=[size], dtype=int)
    self.count = 0
    self.clear()
    return

  def clear(self):
    self.idx_to_slot.fill(MathUtil.INVALID_IDX)
    self.slot_to_idx.fill(MathUtil.INVALID_IDX)
    self.count = 0
    return

  def is_valid(self, idx):
    return self.idx_to_slot[idx] != MathUtil.INVALID_IDX

  def get_size(self):
    return self.idx_to_slot.shape[0]

  def add(self, idx):
    for i in idx:
      if not self.is_valid(i):
        new_slot = self.count
        assert new_slot >= 0

        self.idx_to_slot[i] = new_slot
        self.slot_to_idx[new_slot] = i
        self.count += 1
    return

  def free(self, idx):
    for i in idx:
      if self.is_valid(i):
        slot = self.idx_to_slot[i]
        last_slot = self.count - 1
        last_idx = self.slot_to_idx[last_slot]

        self.idx_to_slot[last_idx] = slot
        self.slot_to_idx[slot] = last_idx
        self.idx_to_slot[i] = MathUtil.INVALID_IDX
        self.slot_to_idx[last_slot] = MathUtil.INVALID_IDX
        self.count -= 1
    return

  def sample(self, n):
    if self.count > 0:
      slots = np.random.randint(0, self.count, size=n)
      idx = self.slot_to_idx[slots]
    else:
      idx = np.empty(shape=[0], dtype=int)
    return idx

  def check_consistency(self):
    valid = True
    if self.count < 0:
      valid = False

    if valid:
      for i in range(self.get_size()):
        if self.is_valid(i):
          s = self.idx_to_slot[i]
          if self.slot_to_idx[s] != i:
            valid = False
            break

        s2i = self.slot_to_idx[i]
        if s2i != MathUtil.INVALID_IDX:
          i2s = self.idx_to_slot[s2i]
          if i2s != i:
            valid = False
            break

    count0 = np.sum(self.idx_to_slot == MathUtil.INVALID_IDX)
    count1 = np.sum(self.slot_to_idx == MathUtil.INVALID_IDX)
    valid &= count0 == count1

    return valid
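
A usage sketch of the SampleBuffer bookkeeping above. The import path is an assumption (the class is defined in this module, assumed to be pybullet_envs.deep_mimic.learning.replay_buffer); the indices are illustrative:

from pybullet_envs.deep_mimic.learning.replay_buffer import SampleBuffer

buf = SampleBuffer(8)  # tracks which replay indices carry a given flag
buf.add([1, 3, 5])     # mark indices 1, 3 and 5 as sampleable
assert buf.count == 3

buf.free([3])          # the last live index (5) is swapped into the freed slot
assert buf.count == 2
assert sorted(buf.slot_to_idx[:buf.count].tolist()) == [1, 5]

idx = buf.sample(4)    # samples (with replacement) only from the live indices
assert all(i in (1, 5) for i in idx)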

View File

@@ -1,18 +1,19 @@
import numpy as np
def compute_return(rewards, gamma, td_lambda, val_t):
  # computes td-lambda return of path
  path_len = len(rewards)
  assert len(val_t) == path_len + 1

  return_t = np.zeros(path_len)
  last_val = rewards[-1] + gamma * val_t[-1]
  return_t[-1] = last_val

  for i in reversed(range(0, path_len - 1)):
    curr_r = rewards[i]
    next_ret = return_t[i + 1]
    curr_val = curr_r + gamma * ((1.0 - td_lambda) * val_t[i + 1] + td_lambda * next_ret)
    return_t[i] = curr_val

  return return_t
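
As a sanity check on the recursion above, a small hand-checkable sketch with illustrative numbers (a standalone restatement of the same recursion, not the library function itself):

import numpy as np


def td_lambda_return(rewards, gamma, td_lambda, val_t):
  # same backward recursion as compute_return above
  ret = np.zeros(len(rewards))
  ret[-1] = rewards[-1] + gamma * val_t[-1]
  for i in reversed(range(len(rewards) - 1)):
    ret[i] = rewards[i] + gamma * ((1.0 - td_lambda) * val_t[i + 1] + td_lambda * ret[i + 1])
  return ret


rewards = np.array([1.0, 1.0, 1.0])
val_t = np.array([0.5, 0.5, 0.5, 0.5])
print(td_lambda_return(rewards, 0.9, 0.95, val_t))
# approx [2.957, 2.262, 1.45]: the last step bootstraps off val_t[-1],
# earlier steps blend the one-step target with the next lambda-return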

View File

@@ -5,139 +5,140 @@ from pybullet_envs.deep_mimic.learning.rl_agent import RLAgent
from pybullet_utils.logger import Logger
import pybullet_data
class RLWorld(object):
  def __init__(self, env, arg_parser):
    TFUtil.disable_gpu()

    self.env = env
    self.arg_parser = arg_parser
    self._enable_training = True
    self.train_agents = []
    self.parse_args(arg_parser)

    self.build_agents()

    return

  def get_enable_training(self):
    return self._enable_training

  def set_enable_training(self, enable):
    self._enable_training = enable
    for i in range(len(self.agents)):
      curr_agent = self.agents[i]
      if curr_agent is not None:
        enable_curr_train = self.train_agents[i] if (len(self.train_agents) > 0) else True
        curr_agent.enable_training = self.enable_training and enable_curr_train

    if (self._enable_training):
      self.env.set_mode(RLAgent.Mode.TRAIN)
    else:
      self.env.set_mode(RLAgent.Mode.TEST)

    return

  enable_training = property(get_enable_training, set_enable_training)

  def parse_args(self, arg_parser):
    self.train_agents = self.arg_parser.parse_bools('train_agents')
    num_agents = self.env.get_num_agents()
    assert (len(self.train_agents) == num_agents or len(self.train_agents) == 0)

    return

  def shutdown(self):
    self.env.shutdown()
    return

  def build_agents(self):
    num_agents = self.env.get_num_agents()
    print("num_agents=", num_agents)
    self.agents = []

    Logger.print2('')
    Logger.print2('Num Agents: {:d}'.format(num_agents))

    agent_files = self.arg_parser.parse_strings('agent_files')
    print("len(agent_files)=", len(agent_files))
    assert (len(agent_files) == num_agents or len(agent_files) == 0)

    model_files = self.arg_parser.parse_strings('model_files')
    assert (len(model_files) == num_agents or len(model_files) == 0)

    output_path = self.arg_parser.parse_string('output_path')
    int_output_path = self.arg_parser.parse_string('int_output_path')

    for i in range(num_agents):
      curr_file = agent_files[i]
      curr_agent = self._build_agent(i, curr_file)

      if curr_agent is not None:
        curr_agent.output_dir = output_path
        curr_agent.int_output_dir = int_output_path
        Logger.print2(str(curr_agent))

        if (len(model_files) > 0):
          curr_model_file = model_files[i]
          if curr_model_file != 'none':
            curr_agent.load_model(pybullet_data.getDataPath() + "/" + curr_model_file)

      self.agents.append(curr_agent)
      Logger.print2('')

    self.set_enable_training(self.enable_training)

    return

  def update(self, timestep):
    #print("world update!\n")
    self._update_agents(timestep)
    self._update_env(timestep)
    return

  def reset(self):
    self._reset_agents()
    self._reset_env()
    return

  def end_episode(self):
    self._end_episode_agents()
    return

  def _update_env(self, timestep):
    self.env.update(timestep)
    return

  def _update_agents(self, timestep):
    #print("len(agents)=",len(self.agents))
    for agent in self.agents:
      if (agent is not None):
        agent.update(timestep)
    return

  def _reset_env(self):
    self.env.reset()
    return

  def _reset_agents(self):
    for agent in self.agents:
      if (agent != None):
        agent.reset()
    return

  def _end_episode_agents(self):
    for agent in self.agents:
      if (agent != None):
        agent.end_episode()
    return

  def _build_agent(self, id, agent_file):
    Logger.print2('Agent {:d}: {}'.format(id, agent_file))
    if (agent_file == 'none'):
      agent = None
    else:
      agent = AgentBuilder.build_agent(self, id, agent_file)
      assert (agent != None), 'Failed to build agent {:d} from: {}'.format(id, agent_file)

    return agent

View File

@@ -8,96 +8,97 @@ from pybullet_utils.logger import Logger
from pybullet_envs.deep_mimic.learning.solvers.solver import Solver
class MPISolver(Solver):
  CHECK_SYNC_ITERS = 1000

  def __init__(self, sess, optimizer, vars):
    super().__init__(vars)
    self.sess = sess
    self.optimizer = optimizer
    self._build_grad_feed(vars)
    self._update = optimizer.apply_gradients(zip(self._grad_tf_list, self.vars))
    self._set_flat_vars = TFUtil.SetFromFlat(sess, self.vars)
    self._get_flat_vars = TFUtil.GetFlat(sess, self.vars)

    self.iter = 0
    grad_dim = self._calc_grad_dim()
    self._flat_grad = np.zeros(grad_dim, dtype=np.float32)
    self._global_flat_grad = np.zeros(grad_dim, dtype=np.float32)

    return

  def get_stepsize(self):
    return self.optimizer._learning_rate_tensor.eval()

  def update(self, grads=None, grad_scale=1.0):
    if grads is not None:
      self._flat_grad = MathUtil.flatten(grads)
    else:
      self._flat_grad.fill(0)
    return self.update_flatgrad(self._flat_grad, grad_scale)

  def update_flatgrad(self, flat_grad, grad_scale=1.0):
    if self.iter % self.CHECK_SYNC_ITERS == 0:
      assert self.check_synced(), Logger.print2('Network parameters desynchronized')

    if grad_scale != 1.0:
      flat_grad *= grad_scale

    MPI.COMM_WORLD.Allreduce(flat_grad, self._global_flat_grad, op=MPI.SUM)
    self._global_flat_grad /= MPIUtil.get_num_procs()

    self._load_flat_grad(self._global_flat_grad)
    self.sess.run([self._update], self._grad_feed)
    self.iter += 1

    return

  def sync(self):
    vars = self._get_flat_vars()
    MPIUtil.bcast(vars)
    self._set_flat_vars(vars)
    return

  def check_synced(self):
    synced = True
    if self._is_root():
      vars = self._get_flat_vars()
      MPIUtil.bcast(vars)
    else:
      vars_local = self._get_flat_vars()
      vars_root = np.empty_like(vars_local)
      MPIUtil.bcast(vars_root)
      synced = (vars_local == vars_root).all()
    return synced

  def _is_root(self):
    return MPIUtil.is_root_proc()

  def _build_grad_feed(self, vars):
    self._grad_tf_list = []
    self._grad_buffers = []
    for v in self.vars:
      shape = v.get_shape()
      grad = np.zeros(shape)
      grad_tf = tf.placeholder(tf.float32, shape=shape)
      self._grad_buffers.append(grad)
      self._grad_tf_list.append(grad_tf)

    self._grad_feed = dict({g_tf: g for g_tf, g in zip(self._grad_tf_list, self._grad_buffers)})

    return

  def _calc_grad_dim(self):
    grad_dim = 0
    for grad in self._grad_buffers:
      grad_dim += grad.size
    return grad_dim

  def _load_flat_grad(self, flat_grad):
    start = 0
    for g in self._grad_buffers:
      size = g.size
      np.copyto(g, np.reshape(flat_grad[start:start + size], g.shape))
      start += size
    return
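
The solver ships gradients through MPI as a single flat vector. A numpy-only sketch of the flatten / _load_flat_grad round trip it relies on, with hypothetical shapes and no MPI or TensorFlow involved:

import numpy as np

grads = [np.ones((2, 3)), np.arange(4, dtype=np.float64)]  # per-variable gradients
flat = np.concatenate([g.ravel() for g in grads])          # what flattening produces
assert flat.shape == (10,)

# what _load_flat_grad does: copy slices of the (averaged) flat vector
# back into the per-variable buffers, preserving each shape
buffers = [np.zeros_like(g) for g in grads]
start = 0
for b in buffers:
  size = b.size
  np.copyto(b, flat[start:start + size].reshape(b.shape))
  start += size

assert all(np.array_equal(b, g) for b, g in zip(buffers, grads))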

View File

@@ -1,15 +1,17 @@
from abc import abstractmethod
import sys, abc
if sys.version_info >= (3, 4):
  ABC = abc.ABC
else:
  ABC = abc.ABCMeta('ABC', (), {})


class Solver(ABC):

  def __init__(self, vars):
    self.vars = vars
    return

  @abstractmethod
  def update(self, grads):
    pass

View File

@@ -6,144 +6,148 @@ from pybullet_envs.deep_mimic.learning.rl_agent import RLAgent
from pybullet_utils.logger import Logger
from pybullet_envs.deep_mimic.learning.tf_normalizer import TFNormalizer
class TFAgent(RLAgent):
  RESOURCE_SCOPE = 'resource'
  SOLVER_SCOPE = 'solvers'

  def __init__(self, world, id, json_data):
    self.tf_scope = 'agent'
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    super().__init__(world, id, json_data)
    self._build_graph(json_data)
    self._init_normalizers()
    return

  def __del__(self):
    self.sess.close()
    return

  def save_model(self, out_path):
    with self.sess.as_default(), self.graph.as_default():
      try:
        save_path = self.saver.save(self.sess, out_path, write_meta_graph=False, write_state=False)
        Logger.print2('Model saved to: ' + save_path)
      except:
        Logger.print2("Failed to save model to: " + save_path)
    return

  def load_model(self, in_path):
    with self.sess.as_default(), self.graph.as_default():
      self.saver.restore(self.sess, in_path)
      self._load_normalizers()
      Logger.print2('Model loaded from: ' + in_path)
    return

  def _get_output_path(self):
    assert (self.output_dir != '')
    file_path = self.output_dir + '/agent' + str(self.id) + '_model.ckpt'
    return file_path

  def _get_int_output_path(self):
    assert (self.int_output_dir != '')
    file_path = self.int_output_dir + (
        '/agent{:d}_models/agent{:d}_int_model_{:010d}.ckpt').format(self.id, self.id, self.iter)
    return file_path

  def _build_graph(self, json_data):
    with self.sess.as_default(), self.graph.as_default():
      with tf.variable_scope(self.tf_scope):
        self._build_nets(json_data)

        with tf.variable_scope(self.SOLVER_SCOPE):
          self._build_losses(json_data)
          self._build_solvers(json_data)

        self._initialize_vars()
        self._build_saver()
    return

  def _init_normalizers(self):
    with self.sess.as_default(), self.graph.as_default():
      # update normalizers to sync the tensorflow tensors
      self.s_norm.update()
      self.g_norm.update()
      self.a_norm.update()
    return

  @abstractmethod
  def _build_nets(self, json_data):
    pass

  @abstractmethod
  def _build_losses(self, json_data):
    pass

  @abstractmethod
  def _build_solvers(self, json_data):
    pass

  def _tf_vars(self, scope=''):
    with self.sess.as_default(), self.graph.as_default():
      res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.tf_scope + '/' + scope)
      assert len(res) > 0
    return res

  def _build_normalizers(self):
    with self.sess.as_default(), self.graph.as_default(), tf.variable_scope(self.tf_scope):
      with tf.variable_scope(self.RESOURCE_SCOPE):
        self.s_norm = TFNormalizer(self.sess, 's_norm', self.get_state_size(),
                                   self.world.env.build_state_norm_groups(self.id))
        state_offset = -self.world.env.build_state_offset(self.id)
        print("state_offset=", state_offset)
        state_scale = 1 / self.world.env.build_state_scale(self.id)
        print("state_scale=", state_scale)
        self.s_norm.set_mean_std(-self.world.env.build_state_offset(self.id),
                                 1 / self.world.env.build_state_scale(self.id))

        self.g_norm = TFNormalizer(self.sess, 'g_norm', self.get_goal_size(),
                                   self.world.env.build_goal_norm_groups(self.id))
        self.g_norm.set_mean_std(-self.world.env.build_goal_offset(self.id),
                                 1 / self.world.env.build_goal_scale(self.id))

        self.a_norm = TFNormalizer(self.sess, 'a_norm', self.get_action_size())
        self.a_norm.set_mean_std(-self.world.env.build_action_offset(self.id),
                                 1 / self.world.env.build_action_scale(self.id))
    return

  def _load_normalizers(self):
    self.s_norm.load()
    self.g_norm.load()
    self.a_norm.load()
    return

  def _update_normalizers(self):
    with self.sess.as_default(), self.graph.as_default():
      super()._update_normalizers()
    return

  def _initialize_vars(self):
    self.sess.run(tf.global_variables_initializer())
    return

  def _build_saver(self):
    vars = self._get_saver_vars()
    self.saver = tf.train.Saver(vars, max_to_keep=0)
    return

  def _get_saver_vars(self):
    with self.sess.as_default(), self.graph.as_default():
      vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.tf_scope)
      vars = [v for v in vars if '/' + self.SOLVER_SCOPE + '/' not in v.name]
      #vars = [v for v in vars if '/target/' not in v.name]
      assert len(vars) > 0
    return vars

  def _weight_decay_loss(self, scope):
    vars = self._tf_vars(scope)
    vars_no_bias = [v for v in vars if 'bias' not in v.name]
    loss = tf.add_n([tf.nn.l2_loss(v) for v in vars_no_bias])
    return loss

  def _train(self):
    with self.sess.as_default(), self.graph.as_default():
      super()._train()
    return

View File

@@ -3,65 +3,72 @@ import copy
import tensorflow as tf
from pybullet_envs.deep_mimic.learning.normalizer import Normalizer
class TFNormalizer(Normalizer):
  def __init__(self, sess, scope, size, groups_ids=None, eps=0.02, clip=np.inf):
    self.sess = sess
    self.scope = scope
    super().__init__(size, groups_ids, eps, clip)

    with tf.variable_scope(self.scope):
      self._build_resource_tf()
    return

  # initialize count when loading saved values so that things don't change too quickly during updates
  def load(self):
    self.count = self.count_tf.eval()[0]
    self.mean = self.mean_tf.eval()
    self.std = self.std_tf.eval()
    self.mean_sq = self.calc_mean_sq(self.mean, self.std)
    return

  def update(self):
    super().update()
    self._update_resource_tf()
    return

  def set_mean_std(self, mean, std):
    super().set_mean_std(mean, std)
    self._update_resource_tf()
    return

  def normalize_tf(self, x):
    norm_x = (x - self.mean_tf) / self.std_tf
    norm_x = tf.clip_by_value(norm_x, -self.clip, self.clip)
    return norm_x

  def unnormalize_tf(self, norm_x):
    x = norm_x * self.std_tf + self.mean_tf
    return x

  def _build_resource_tf(self):
    self.count_tf = tf.get_variable(dtype=tf.int32,
                                    name='count',
                                    initializer=np.array([self.count], dtype=np.int32),
                                    trainable=False)
    self.mean_tf = tf.get_variable(dtype=tf.float32,
                                   name='mean',
                                   initializer=self.mean.astype(np.float32),
                                   trainable=False)
    self.std_tf = tf.get_variable(dtype=tf.float32,
                                  name='std',
                                  initializer=self.std.astype(np.float32),
                                  trainable=False)

    self.count_ph = tf.get_variable(dtype=tf.int32, name='count_ph', shape=[1])
    self.mean_ph = tf.get_variable(dtype=tf.float32, name='mean_ph', shape=self.mean.shape)
    self.std_ph = tf.get_variable(dtype=tf.float32, name='std_ph', shape=self.std.shape)

    self._update_op = tf.group(self.count_tf.assign(self.count_ph),
                               self.mean_tf.assign(self.mean_ph), self.std_tf.assign(self.std_ph))
    return

  def _update_resource_tf(self):
    feed = {
        self.count_ph: np.array([self.count], dtype=np.int32),
        self.mean_ph: self.mean,
        self.std_ph: self.std
    }
    self.sess.run(self._update_op, feed_dict=feed)
    return
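
A numpy sketch of the normalize/unnormalize pair mirrored by normalize_tf and unnormalize_tf above (illustrative values; the TF versions do the same arithmetic on tensors and clip to +/- self.clip):

import numpy as np

mean = np.array([0.0, 1.0])
std = np.array([2.0, 0.5])
clip = 5.0

x = np.array([4.0, -1.0])
norm_x = np.clip((x - mean) / std, -clip, clip)  # -> [ 2., -4.]
restored = norm_x * std + mean                   # -> [ 4., -1.]
assert np.allclose(restored, x)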

View File

@@ -4,101 +4,116 @@ import os
xavier_initializer = tf.contrib.layers.xavier_initializer()
def disable_gpu():
  os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
  return


def var_shape(x):
  out = [k.value for k in x.get_shape()]
  assert all(isinstance(a, int) for a in out), "shape function assumes that shape is fully known"
  return out


def intprod(x):
  return int(np.prod(x))


def numel(x):
  n = intprod(var_shape(x))
  return n


def flat_grad(loss, var_list, grad_ys=None):
  grads = tf.gradients(loss, var_list, grad_ys)
  return tf.concat([tf.reshape(grad, [numel(v)]) for (v, grad) in zip(var_list, grads)], axis=0)


def fc_net(input, layers_sizes, activation, reuse=None,
           flatten=False):  # build fully connected network
  curr_tf = input
  for i, size in enumerate(layers_sizes):
    with tf.variable_scope(str(i), reuse=reuse):
      curr_tf = tf.layers.dense(inputs=curr_tf,
                                units=size,
                                kernel_initializer=xavier_initializer,
                                activation=activation if i < len(layers_sizes) - 1 else None)
  if flatten:
    assert layers_sizes[-1] == 1
    curr_tf = tf.reshape(curr_tf, [-1])

  return curr_tf


def copy(sess, src, dst):
  assert len(src) == len(dst)
  sess.run(list(map(lambda v: v[1].assign(v[0]), zip(src, dst))))
  return


def flat_grad(loss, var_list):
  grads = tf.gradients(loss, var_list)
  return tf.concat(axis=0,
                   values=[tf.reshape(grad, [numel(v)]) for (v, grad) in zip(var_list, grads)])


def calc_logp_gaussian(x_tf, mean_tf, std_tf):
  dim = tf.to_float(tf.shape(x_tf)[-1])

  if mean_tf is None:
    diff_tf = x_tf
  else:
    diff_tf = x_tf - mean_tf

  logp_tf = -0.5 * tf.reduce_sum(tf.square(diff_tf / std_tf), axis=-1)
  logp_tf += -0.5 * dim * np.log(2 * np.pi) - tf.reduce_sum(tf.log(std_tf), axis=-1)

  return logp_tf


def calc_bound_loss(x_tf, bound_min, bound_max):
  # penalty for violating bounds
  violation_min = tf.minimum(x_tf - bound_min, 0)
  violation_max = tf.maximum(x_tf - bound_max, 0)
  violation = tf.reduce_sum(tf.square(violation_min), axis=-1) + tf.reduce_sum(
      tf.square(violation_max), axis=-1)
  loss = 0.5 * tf.reduce_mean(violation)
  return loss


class SetFromFlat(object):

  def __init__(self, sess, var_list, dtype=tf.float32):
    assigns = []
    shapes = list(map(var_shape, var_list))
    total_size = np.sum([intprod(shape) for shape in shapes])

    self.sess = sess
    self.theta = tf.placeholder(dtype, [total_size])
    start = 0
    assigns = []

    for (shape, v) in zip(shapes, var_list):
      size = intprod(shape)
      assigns.append(tf.assign(v, tf.reshape(self.theta[start:start + size], shape)))
      start += size

    self.op = tf.group(*assigns)

    return

  def __call__(self, theta):
    self.sess.run(self.op, feed_dict={self.theta: theta})
    return


class GetFlat(object):

  def __init__(self, sess, var_list):
    self.sess = sess
    self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])
    return

  def __call__(self):
    return self.sess.run(self.op)
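
calc_logp_gaussian above is the log-density of a diagonal Gaussian. A numpy-only check of the same formula against a per-dimension computation, with illustrative values:

import numpy as np

x = np.array([0.3, -1.2, 0.8])
mean = np.array([0.0, -1.0, 1.0])
std = np.array([0.5, 1.0, 2.0])

dim = x.shape[-1]
logp = -0.5 * np.sum(np.square((x - mean) / std), axis=-1) \
       - 0.5 * dim * np.log(2 * np.pi) - np.sum(np.log(std), axis=-1)

# the same quantity summed dimension by dimension
per_dim = -0.5 * ((x - mean) / std)**2 - 0.5 * np.log(2 * np.pi) - np.log(std)
assert np.isclose(logp, per_dim.sum())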