add yapf style and apply yapf to format all Python files

This recreates pull request #2192
Author: Erwin Coumans
Date: 2019-04-27 07:31:15 -07:00
parent c591735042
commit ef9570c315
347 changed files with 70304 additions and 22752 deletions
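
The change itself is mechanical: every Python file in the tree is run through yapf with the repository's style settings and rewritten in place. Below is a rough sketch of such a pass using yapf's FormatCode API; the inline style options (google base style, 2-space indents, 100-column lines) and the root path are assumptions inferred from the formatted hunks below, not the exact contents of the committed style file.

    # Sketch only: reformat a tree of Python files with yapf.
    # The style string and root path are illustrative assumptions, not the
    # committed .style.yapf.
    import glob

    from yapf.yapflib.yapf_api import FormatCode

    ASSUMED_STYLE = '{based_on_style: google, indent_width: 2, column_limit: 100}'


    def format_tree(root):
      for path in glob.glob(root + '/**/*.py', recursive=True):
        with open(path) as f:
          src = f.read()
        # FormatCode returns the reformatted source and whether it changed.
        formatted, changed = FormatCode(src, style_config=ASSUMED_STYLE)
        if changed:
          with open(path, 'w') as f:
            f.write(formatted)


    if __name__ == '__main__':
      format_tree('.')

The same result can be obtained from the command line with yapf -i -r <dir> once a matching style file is in place.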

View File

@@ -5,17 +5,18 @@ import pybullet_data
AGENT_TYPE_KEY = "AgentType"

def build_agent(world, id, file):
  agent = None
  with open(pybullet_data.getDataPath() + "/" + file) as data_file:
    json_data = json.load(data_file)

    assert AGENT_TYPE_KEY in json_data
    agent_type = json_data[AGENT_TYPE_KEY]

    if (agent_type == PPOAgent.NAME):
      agent = PPOAgent(world, id, json_data)
    else:
      assert False, 'Unsupported agent type: ' + agent_type

  return agent

View File

@@ -2,53 +2,54 @@ import json
import numpy as np
import pybullet_utils.math_util as MathUtil
class ExpParams(object):
  RATE_KEY = 'Rate'
  INIT_ACTION_RATE_KEY = 'InitActionRate'
  NOISE_KEY = 'Noise'
  NOISE_INTERNAL_KEY = 'NoiseInternal'
  TEMP_KEY = 'Temp'

  def __init__(self):
    self.rate = 0.2
    self.init_action_rate = 0
    self.noise = 0.1
    self.noise_internal = 0
    self.temp = 0.1
    return

  def __str__(self):
    str = ''
    str += '{}: {:.2f}\n'.format(self.RATE_KEY, self.rate)
    str += '{}: {:.2f}\n'.format(self.INIT_ACTION_RATE_KEY, self.init_action_rate)
    str += '{}: {:.2f}\n'.format(self.NOISE_KEY, self.noise)
    str += '{}: {:.2f}\n'.format(self.NOISE_INTERNAL_KEY, self.noise_internal)
    str += '{}: {:.2f}\n'.format(self.TEMP_KEY, self.temp)
    return str

  def load(self, json_data):
    if (self.RATE_KEY in json_data):
      self.rate = json_data[self.RATE_KEY]

    if (self.INIT_ACTION_RATE_KEY in json_data):
      self.init_action_rate = json_data[self.INIT_ACTION_RATE_KEY]

    if (self.NOISE_KEY in json_data):
      self.noise = json_data[self.NOISE_KEY]

    if (self.NOISE_INTERNAL_KEY in json_data):
      self.noise_internal = json_data[self.NOISE_INTERNAL_KEY]

    if (self.TEMP_KEY in json_data):
      self.temp = json_data[self.TEMP_KEY]

    return

  def lerp(self, other, t):
    lerp_params = ExpParams()
    lerp_params.rate = MathUtil.lerp(self.rate, other.rate, t)
    lerp_params.init_action_rate = MathUtil.lerp(self.init_action_rate, other.init_action_rate, t)
    lerp_params.noise = MathUtil.lerp(self.noise, other.noise, t)
    lerp_params.noise_internal = MathUtil.lerp(self.noise_internal, other.noise_internal, t)
    lerp_params.temp = MathUtil.log_lerp(self.temp, other.temp, t)
    return lerp_params

View File

@@ -1 +1 @@
from . import *

View File

@@ -3,11 +3,12 @@ import pybullet_envs.deep_mimic.learning.tf_util as TFUtil
NAME = "fc_2layers_1024units"

def build_net(input_tfs, reuse=False):
  layers = [1024, 512]
  activation = tf.nn.relu

  input_tf = tf.concat(axis=-1, values=input_tfs)
  h = TFUtil.fc_net(input_tf, layers, activation=activation, reuse=reuse)
  h = activation(h)
  return h

View File

@@ -1,11 +1,12 @@
import pybullet_envs.deep_mimic.learning.nets.fc_2layers_1024units as fc_2layers_1024units

def build_net(net_name, input_tfs, reuse=False):
  net = None

  if (net_name == fc_2layers_1024units.NAME):
    net = fc_2layers_1024units.build_net(input_tfs, reuse)
  else:
    assert False, 'Unsupported net: ' + net_name

  return net

View File

@@ -3,147 +3,149 @@ import copy
import pybullet_utils.mpi_util as MPIUtil
from pybullet_utils.logger import Logger
class Normalizer(object):
  CHECK_SYNC_COUNT = 50000  # check synchronization after a certain number of entries

  # these group IDs must be the same as those in CharController.h
  NORM_GROUP_SINGLE = 0
  NORM_GROUP_NONE = -1

  class Group(object):

    def __init__(self, id, indices):
      self.id = id
      self.indices = indices
      return

  def __init__(self, size, groups_ids=None, eps=0.02, clip=np.inf):
    self.eps = eps
    self.clip = clip
    self.mean = np.zeros(size)
    self.mean_sq = np.zeros(size)
    self.std = np.ones(size)
    self.count = 0
    self.groups = self._build_groups(groups_ids)

    self.new_count = 0
    self.new_sum = np.zeros_like(self.mean)
    self.new_sum_sq = np.zeros_like(self.mean_sq)
    return

  def record(self, x):
    size = self.get_size()
    is_array = isinstance(x, np.ndarray)
    if not is_array:
      assert (size == 1)
      x = np.array([[x]])

    assert x.shape[-1] == size, \
      Logger.print2('Normalizer shape mismatch, expecting size {:d}, but got {:d}'.format(size, x.shape[-1]))
    x = np.reshape(x, [-1, size])

    self.new_count += x.shape[0]
    self.new_sum += np.sum(x, axis=0)
    self.new_sum_sq += np.sum(np.square(x), axis=0)
    return

  def update(self):
    new_count = MPIUtil.reduce_sum(self.new_count)
    new_sum = MPIUtil.reduce_sum(self.new_sum)
    new_sum_sq = MPIUtil.reduce_sum(self.new_sum_sq)

    new_total = self.count + new_count
    if (self.count // self.CHECK_SYNC_COUNT != new_total // self.CHECK_SYNC_COUNT):
      assert self.check_synced(), Logger.print2('Normalizer parameters desynchronized')

    if new_count > 0:
      new_mean = self._process_group_data(new_sum / new_count, self.mean)
      new_mean_sq = self._process_group_data(new_sum_sq / new_count, self.mean_sq)
      w_old = float(self.count) / new_total
      w_new = float(new_count) / new_total

      self.mean = w_old * self.mean + w_new * new_mean
      self.mean_sq = w_old * self.mean_sq + w_new * new_mean_sq
      self.count = new_total
      self.std = self.calc_std(self.mean, self.mean_sq)

      self.new_count = 0
      self.new_sum.fill(0)
      self.new_sum_sq.fill(0)

    return

  def get_size(self):
    return self.mean.size

  def set_mean_std(self, mean, std):
    size = self.get_size()
    is_array = isinstance(mean, np.ndarray) and isinstance(std, np.ndarray)

    if not is_array:
      assert (size == 1)
      mean = np.array([mean])
      std = np.array([std])

    assert len(mean) == size and len(std) == size, \
      Logger.print2('Normalizer shape mismatch, expecting size {:d}, but got {:d} and {:d}'.format(size, len(mean), len(std)))

    self.mean = mean
    self.std = std
    self.mean_sq = self.calc_mean_sq(self.mean, self.std)
    return

  def normalize(self, x):
    norm_x = (x - self.mean) / self.std
    norm_x = np.clip(norm_x, -self.clip, self.clip)
    return norm_x

  def unnormalize(self, norm_x):
    x = norm_x * self.std + self.mean
    return x

  def calc_std(self, mean, mean_sq):
    var = mean_sq - np.square(mean)
    # some time floating point errors can lead to small negative numbers
    var = np.maximum(var, 0)
    std = np.sqrt(var)
    std = np.maximum(std, self.eps)
    return std

  def calc_mean_sq(self, mean, std):
    return np.square(std) + np.square(self.mean)

  def check_synced(self):
    synced = True
    if MPIUtil.is_root_proc():
      vars = np.concatenate([self.mean, self.mean_sq])
      MPIUtil.bcast(vars)
    else:
      vars_local = np.concatenate([self.mean, self.mean_sq])
      vars_root = np.empty_like(vars_local)
      MPIUtil.bcast(vars_root)
      synced = (vars_local == vars_root).all()
    return synced

  def _build_groups(self, groups_ids):
    groups = []
    if groups_ids is None:
      curr_id = self.NORM_GROUP_SINGLE
      curr_list = np.arange(self.get_size()).astype(np.int32)
      groups.append(self.Group(curr_id, curr_list))
    else:
      ids = np.unique(groups_ids)
      for id in ids:
        curr_list = np.nonzero(groups_ids == id)[0].astype(np.int32)
        groups.append(self.Group(id, curr_list))

    return groups

  def _process_group_data(self, new_data, old_data):
    proc_data = new_data.copy()
    for group in self.groups:
      if group.id == self.NORM_GROUP_NONE:
        proc_data[group.indices] = old_data[group.indices]
      elif group.id != self.NORM_GROUP_SINGLE:
        avg = np.mean(new_data[group.indices])
        proc_data[group.indices] = avg
    return proc_data

View File

@@ -1,46 +1,47 @@
import numpy as np
from pybullet_envs.deep_mimic.env.env import Env
class Path(object):

  def __init__(self):
    self.clear()
    return

  def pathlength(self):
    return len(self.actions)

  def is_valid(self):
    valid = True
    l = self.pathlength()
    valid &= len(self.states) == l + 1
    valid &= len(self.goals) == l + 1
    valid &= len(self.actions) == l
    valid &= len(self.logps) == l
    valid &= len(self.rewards) == l
    valid &= len(self.flags) == l

    return valid

  def check_vals(self):
    for vals in [self.states, self.goals, self.actions, self.logps, self.rewards]:
      for v in vals:
        if not np.isfinite(v).all():
          return False
    return True

  def clear(self):
    self.states = []
    self.goals = []
    self.actions = []
    self.logps = []
    self.rewards = []
    self.flags = []
    self.terminate = Env.Terminate.Null
    return

  def get_pathlen(self):
    return len(self.rewards)

  def calc_return(self):
    return sum(self.rewards)

View File

@@ -13,341 +13,343 @@ import pybullet_utils.mpi_util as MPIUtil
import pybullet_utils.math_util as MathUtil
from pybullet_envs.deep_mimic.env.action_space import ActionSpace
from pybullet_envs.deep_mimic.env.env import Env
'''
Policy Gradient Agent
'''
class PGAgent(TFAgent):
  NAME = 'PG'

  ACTOR_NET_KEY = 'ActorNet'
  ACTOR_STEPSIZE_KEY = 'ActorStepsize'
  ACTOR_MOMENTUM_KEY = 'ActorMomentum'
  ACTOR_WEIGHT_DECAY_KEY = 'ActorWeightDecay'
  ACTOR_INIT_OUTPUT_SCALE_KEY = 'ActorInitOutputScale'

  CRITIC_NET_KEY = 'CriticNet'
  CRITIC_STEPSIZE_KEY = 'CriticStepsize'
  CRITIC_MOMENTUM_KEY = 'CriticMomentum'
  CRITIC_WEIGHT_DECAY_KEY = 'CriticWeightDecay'

  EXP_ACTION_FLAG = 1 << 0

  def __init__(self, world, id, json_data):
    self._exp_action = False
    super().__init__(world, id, json_data)
    return

  def reset(self):
    super().reset()
    self._exp_action = False
    return

  def _check_action_space(self):
    action_space = self.get_action_space()
    return action_space == ActionSpace.Continuous

  def _load_params(self, json_data):
    super()._load_params(json_data)
    self.val_min, self.val_max = self._calc_val_bounds(self.discount)
    self.val_fail, self.val_succ = self._calc_term_vals(self.discount)
    return

  def _build_nets(self, json_data):
    assert self.ACTOR_NET_KEY in json_data
    assert self.CRITIC_NET_KEY in json_data

    actor_net_name = json_data[self.ACTOR_NET_KEY]
    critic_net_name = json_data[self.CRITIC_NET_KEY]
    actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data
                                   ) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]

    s_size = self.get_state_size()
    g_size = self.get_goal_size()
    a_size = self.get_action_size()

    # setup input tensors
    self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")  # observations
    self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")  # target value s
    self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage
    self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")  # target actions
    self.g_tf = tf.placeholder(tf.float32,
                               shape=([None, g_size] if self.has_goal() else None),
                               name="g")  # goals

    with tf.variable_scope('main'):
      with tf.variable_scope('actor'):
        self.actor_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
      with tf.variable_scope('critic'):
        self.critic_tf = self._build_net_critic(critic_net_name)

    if (self.actor_tf != None):
      Logger.print2('Built actor net: ' + actor_net_name)

    if (self.critic_tf != None):
      Logger.print2('Built critic net: ' + critic_net_name)

    return

  def _build_normalizers(self):
    super()._build_normalizers()
    with self.sess.as_default(), self.graph.as_default(), tf.variable_scope(self.tf_scope):
      with tf.variable_scope(self.RESOURCE_SCOPE):
        val_offset, val_scale = self._calc_val_offset_scale(self.discount)
        self.val_norm = TFNormalizer(self.sess, 'val_norm', 1)
        self.val_norm.set_mean_std(-val_offset, 1.0 / val_scale)
    return

  def _init_normalizers(self):
    super()._init_normalizers()
    with self.sess.as_default(), self.graph.as_default():
      self.val_norm.update()
    return

  def _load_normalizers(self):
    super()._load_normalizers()
    self.val_norm.load()
    return

  def _build_losses(self, json_data):
    actor_weight_decay = 0 if (
        self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
    critic_weight_decay = 0 if (
        self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]

    norm_val_diff = self.val_norm.normalize_tf(self.tar_val_tf) - self.val_norm.normalize_tf(
        self.critic_tf)
    self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))

    if (critic_weight_decay != 0):
      self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss('main/critic')

    norm_a_mean_tf = self.a_norm.normalize_tf(self.actor_tf)
    norm_a_diff = self.a_norm.normalize_tf(self.a_tf) - norm_a_mean_tf

    self.actor_loss_tf = tf.reduce_sum(tf.square(norm_a_diff), axis=-1)
    self.actor_loss_tf *= self.adv_tf
    self.actor_loss_tf = 0.5 * tf.reduce_mean(self.actor_loss_tf)

    norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
    norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
    a_bound_loss = TFUtil.calc_bound_loss(norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
    a_bound_loss /= self.exp_params_curr.noise
    self.actor_loss_tf += a_bound_loss

    if (actor_weight_decay != 0):
      self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss('main/actor')

    return

  def _build_solvers(self, json_data):
    actor_stepsize = 0.001 if (
        self.ACTOR_STEPSIZE_KEY not in json_data) else json_data[self.ACTOR_STEPSIZE_KEY]
    actor_momentum = 0.9 if (
        self.ACTOR_MOMENTUM_KEY not in json_data) else json_data[self.ACTOR_MOMENTUM_KEY]
    critic_stepsize = 0.01 if (
        self.CRITIC_STEPSIZE_KEY not in json_data) else json_data[self.CRITIC_STEPSIZE_KEY]
    critic_momentum = 0.9 if (
        self.CRITIC_MOMENTUM_KEY not in json_data) else json_data[self.CRITIC_MOMENTUM_KEY]

    critic_vars = self._tf_vars('main/critic')
    critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize,
                                            momentum=critic_momentum)
    self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
    self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)

    actor_vars = self._tf_vars('main/actor')
    actor_opt = tf.train.MomentumOptimizer(learning_rate=actor_stepsize, momentum=actor_momentum)
    self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
    self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)

    return

  def _build_net_actor(self, net_name, init_output_scale):
    norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
    input_tfs = [norm_s_tf]
    if (self.has_goal()):
      norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
      input_tfs += [norm_g_tf]

    h = NetBuilder.build_net(net_name, input_tfs)
    norm_a_tf = tf.layers.dense(inputs=h,
                                units=self.get_action_size(),
                                activation=None,
                                kernel_initializer=tf.random_uniform_initializer(
                                    minval=-init_output_scale, maxval=init_output_scale))

    a_tf = self.a_norm.unnormalize_tf(norm_a_tf)
    return a_tf

  def _build_net_critic(self, net_name):
    norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
    input_tfs = [norm_s_tf]
    if (self.has_goal()):
      norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
      input_tfs += [norm_g_tf]

    h = NetBuilder.build_net(net_name, input_tfs)
    norm_val_tf = tf.layers.dense(inputs=h,
                                  units=1,
                                  activation=None,
                                  kernel_initializer=TFUtil.xavier_initializer)

    norm_val_tf = tf.reshape(norm_val_tf, [-1])
    val_tf = self.val_norm.unnormalize_tf(norm_val_tf)
    return val_tf

  def _initialize_vars(self):
    super()._initialize_vars()
    self._sync_solvers()
    return

  def _sync_solvers(self):
    self.actor_solver.sync()
    self.critic_solver.sync()
    return

  def _decide_action(self, s, g):
    with self.sess.as_default(), self.graph.as_default():
      self._exp_action = False
      a = self._eval_actor(s, g)[0]
      logp = 0

      if self._enable_stoch_policy():
        # epsilon-greedy
        rand_action = MathUtil.flip_coin(self.exp_params_curr.rate)
        if rand_action:
          norm_exp_noise = np.random.randn(*a.shape)
          norm_exp_noise *= self.exp_params_curr.noise
          exp_noise = norm_exp_noise * self.a_norm.std
          a += exp_noise

          logp = self._calc_action_logp(norm_exp_noise)
          self._exp_action = True

    return a, logp

  def _enable_stoch_policy(self):
    return self.enable_training and (self._mode == self.Mode.TRAIN or
                                     self._mode == self.Mode.TRAIN_END)

  def _eval_actor(self, s, g):
    s = np.reshape(s, [-1, self.get_state_size()])
    g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None

    feed = {self.s_tf: s, self.g_tf: g}

    a = self.actor_tf.eval(feed)
    return a

  def _eval_critic(self, s, g):
    with self.sess.as_default(), self.graph.as_default():
      s = np.reshape(s, [-1, self.get_state_size()])
      g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None

      feed = {self.s_tf: s, self.g_tf: g}

      val = self.critic_tf.eval(feed)
    return val

  def _record_flags(self):
    flags = int(0)
    if (self._exp_action):
      flags = flags | self.EXP_ACTION_FLAG
    return flags

  def _train_step(self):
    super()._train_step()

    critic_loss = self._update_critic()
    actor_loss = self._update_actor()
    critic_loss = MPIUtil.reduce_avg(critic_loss)
    actor_loss = MPIUtil.reduce_avg(actor_loss)

    critic_stepsize = self.critic_solver.get_stepsize()
    actor_stepsize = self.actor_solver.get_stepsize()

    self.logger.log_tabular('Critic_Loss', critic_loss)
    self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
    self.logger.log_tabular('Actor_Loss', actor_loss)
    self.logger.log_tabular('Actor_Stepsize', actor_stepsize)

    return

  def _update_critic(self):
    idx = self.replay_buffer.sample(self._local_mini_batch_size)
    s = self.replay_buffer.get('states', idx)
    g = self.replay_buffer.get('goals', idx) if self.has_goal() else None

    tar_V = self._calc_updated_vals(idx)
    tar_V = np.clip(tar_V, self.val_min, self.val_max)

    feed = {self.s_tf: s, self.g_tf: g, self.tar_val_tf: tar_V}

    loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf], feed)
    self.critic_solver.update(grads)
    return loss

  def _update_actor(self):
    key = self.EXP_ACTION_FLAG
    idx = self.replay_buffer.sample_filtered(self._local_mini_batch_size, key)
    has_goal = self.has_goal()

    s = self.replay_buffer.get('states', idx)
    g = self.replay_buffer.get('goals', idx) if has_goal else None
    a = self.replay_buffer.get('actions', idx)

    V_new = self._calc_updated_vals(idx)
    V_old = self._eval_critic(s, g)
    adv = V_new - V_old

    feed = {self.s_tf: s, self.g_tf: g, self.a_tf: a, self.adv_tf: adv}

    loss, grads = self.sess.run([self.actor_loss_tf, self.actor_grad_tf], feed)
    self.actor_solver.update(grads)

    return loss

  def _calc_updated_vals(self, idx):
    r = self.replay_buffer.get('rewards', idx)

    if self.discount == 0:
      new_V = r
    else:
      next_idx = self.replay_buffer.get_next_idx(idx)
      s_next = self.replay_buffer.get('states', next_idx)
      g_next = self.replay_buffer.get('goals', next_idx) if self.has_goal() else None

      is_end = self.replay_buffer.is_path_end(idx)
      is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
      is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
      is_fail = np.logical_and(is_end, is_fail)
      is_succ = np.logical_and(is_end, is_succ)

      V_next = self._eval_critic(s_next, g_next)
      V_next[is_fail] = self.val_fail
      V_next[is_succ] = self.val_succ

      new_V = r + self.discount * V_next
    return new_V

  def _calc_action_logp(self, norm_action_deltas):
    # norm action delta are for the normalized actions (scaled by self.a_norm.std)
    stdev = self.exp_params_curr.noise
    assert stdev > 0

    a_size = self.get_action_size()
    logp = -0.5 / (stdev * stdev) * np.sum(np.square(norm_action_deltas), axis=-1)
    logp += -0.5 * a_size * np.log(2 * np.pi)
    logp += -a_size * np.log(stdev)
    return logp

  def _log_val(self, s, g):
    val = self._eval_critic(s, g)
    norm_val = self.val_norm.normalize(val)
    self.world.env.log_val(self.id, norm_val[0])
    return

  def _build_replay_buffer(self, buffer_size):
    super()._build_replay_buffer(buffer_size)
    self.replay_buffer.add_filter_key(self.EXP_ACTION_FLAG)
    return

View File

@@ -10,359 +10,374 @@ from pybullet_utils.logger import Logger
import pybullet_utils.mpi_util as MPIUtil
import pybullet_utils.math_util as MathUtil
from pybullet_envs.deep_mimic.env.env import Env
'''
Proximal Policy Optimization Agent
'''
class PPOAgent(PGAgent):
NAME = "PPO"
EPOCHS_KEY = "Epochs"
BATCH_SIZE_KEY = "BatchSize"
RATIO_CLIP_KEY = "RatioClip"
NORM_ADV_CLIP_KEY = "NormAdvClip"
TD_LAMBDA_KEY = "TDLambda"
TAR_CLIP_FRAC = "TarClipFrac"
ACTOR_STEPSIZE_DECAY = "ActorStepsizeDecay"
NAME = "PPO"
EPOCHS_KEY = "Epochs"
BATCH_SIZE_KEY = "BatchSize"
RATIO_CLIP_KEY = "RatioClip"
NORM_ADV_CLIP_KEY = "NormAdvClip"
TD_LAMBDA_KEY = "TDLambda"
TAR_CLIP_FRAC = "TarClipFrac"
ACTOR_STEPSIZE_DECAY = "ActorStepsizeDecay"
def __init__(self, world, id, json_data):
super().__init__(world, id, json_data)
return
def __init__(self, world, id, json_data):
super().__init__(world, id, json_data)
return
def _load_params(self, json_data):
super()._load_params(json_data)
def _load_params(self, json_data):
super()._load_params(json_data)
self.epochs = 1 if (self.EPOCHS_KEY not in json_data) else json_data[self.EPOCHS_KEY]
self.batch_size = 1024 if (self.BATCH_SIZE_KEY not in json_data) else json_data[self.BATCH_SIZE_KEY]
self.ratio_clip = 0.2 if (self.RATIO_CLIP_KEY not in json_data) else json_data[self.RATIO_CLIP_KEY]
self.norm_adv_clip = 5 if (self.NORM_ADV_CLIP_KEY not in json_data) else json_data[self.NORM_ADV_CLIP_KEY]
self.td_lambda = 0.95 if (self.TD_LAMBDA_KEY not in json_data) else json_data[self.TD_LAMBDA_KEY]
self.tar_clip_frac = -1 if (self.TAR_CLIP_FRAC not in json_data) else json_data[self.TAR_CLIP_FRAC]
self.actor_stepsize_decay = 0.5 if (self.ACTOR_STEPSIZE_DECAY not in json_data) else json_data[self.ACTOR_STEPSIZE_DECAY]
self.epochs = 1 if (self.EPOCHS_KEY not in json_data) else json_data[self.EPOCHS_KEY]
self.batch_size = 1024 if (
self.BATCH_SIZE_KEY not in json_data) else json_data[self.BATCH_SIZE_KEY]
self.ratio_clip = 0.2 if (
self.RATIO_CLIP_KEY not in json_data) else json_data[self.RATIO_CLIP_KEY]
self.norm_adv_clip = 5 if (
self.NORM_ADV_CLIP_KEY not in json_data) else json_data[self.NORM_ADV_CLIP_KEY]
self.td_lambda = 0.95 if (
self.TD_LAMBDA_KEY not in json_data) else json_data[self.TD_LAMBDA_KEY]
self.tar_clip_frac = -1 if (
self.TAR_CLIP_FRAC not in json_data) else json_data[self.TAR_CLIP_FRAC]
self.actor_stepsize_decay = 0.5 if (
self.ACTOR_STEPSIZE_DECAY not in json_data) else json_data[self.ACTOR_STEPSIZE_DECAY]
num_procs = MPIUtil.get_num_procs()
local_batch_size = int(self.batch_size / num_procs)
min_replay_size = 2 * local_batch_size # needed to prevent buffer overflow
assert(self.replay_buffer_size > min_replay_size)
num_procs = MPIUtil.get_num_procs()
local_batch_size = int(self.batch_size / num_procs)
min_replay_size = 2 * local_batch_size # needed to prevent buffer overflow
assert (self.replay_buffer_size > min_replay_size)
self.replay_buffer_size = np.maximum(min_replay_size, self.replay_buffer_size)
self.replay_buffer_size = np.maximum(min_replay_size, self.replay_buffer_size)
return
return
def _build_nets(self, json_data):
assert self.ACTOR_NET_KEY in json_data
assert self.CRITIC_NET_KEY in json_data
def _build_nets(self, json_data):
assert self.ACTOR_NET_KEY in json_data
assert self.CRITIC_NET_KEY in json_data
actor_net_name = json_data[self.ACTOR_NET_KEY]
critic_net_name = json_data[self.CRITIC_NET_KEY]
actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
actor_net_name = json_data[self.ACTOR_NET_KEY]
critic_net_name = json_data[self.CRITIC_NET_KEY]
actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data
) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
s_size = self.get_state_size()
g_size = self.get_goal_size()
a_size = self.get_action_size()
s_size = self.get_state_size()
g_size = self.get_goal_size()
a_size = self.get_action_size()
# setup input tensors
self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")
self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")
self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")
self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")
self.g_tf = tf.placeholder(tf.float32, shape=([None, g_size] if self.has_goal() else None), name="g")
self.old_logp_tf = tf.placeholder(tf.float32, shape=[None], name="old_logp")
self.exp_mask_tf = tf.placeholder(tf.float32, shape=[None], name="exp_mask")
# setup input tensors
self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")
self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")
self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")
self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")
self.g_tf = tf.placeholder(tf.float32,
shape=([None, g_size] if self.has_goal() else None),
name="g")
self.old_logp_tf = tf.placeholder(tf.float32, shape=[None], name="old_logp")
self.exp_mask_tf = tf.placeholder(tf.float32, shape=[None], name="exp_mask")
with tf.variable_scope('main'):
with tf.variable_scope('actor'):
self.a_mean_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
with tf.variable_scope('critic'):
self.critic_tf = self._build_net_critic(critic_net_name)
if (self.a_mean_tf != None):
Logger.print2('Built actor net: ' + actor_net_name)
with tf.variable_scope('main'):
with tf.variable_scope('actor'):
self.a_mean_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
with tf.variable_scope('critic'):
self.critic_tf = self._build_net_critic(critic_net_name)
if (self.critic_tf != None):
Logger.print2('Built critic net: ' + critic_net_name)
self.norm_a_std_tf = self.exp_params_curr.noise * tf.ones(a_size)
norm_a_noise_tf = self.norm_a_std_tf * tf.random_normal(shape=tf.shape(self.a_mean_tf))
norm_a_noise_tf *= tf.expand_dims(self.exp_mask_tf, axis=-1)
self.sample_a_tf = self.a_mean_tf + norm_a_noise_tf * self.a_norm.std_tf
self.sample_a_logp_tf = TFUtil.calc_logp_gaussian(x_tf=norm_a_noise_tf, mean_tf=None, std_tf=self.norm_a_std_tf)
if (self.a_mean_tf != None):
Logger.print2('Built actor net: ' + actor_net_name)
return
if (self.critic_tf != None):
Logger.print2('Built critic net: ' + critic_net_name)
def _build_losses(self, json_data):
actor_weight_decay = 0 if (self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
critic_weight_decay = 0 if (self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]
norm_val_diff = self.val_norm.normalize_tf(self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf)
self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))
self.norm_a_std_tf = self.exp_params_curr.noise * tf.ones(a_size)
norm_a_noise_tf = self.norm_a_std_tf * tf.random_normal(shape=tf.shape(self.a_mean_tf))
norm_a_noise_tf *= tf.expand_dims(self.exp_mask_tf, axis=-1)
self.sample_a_tf = self.a_mean_tf + norm_a_noise_tf * self.a_norm.std_tf
self.sample_a_logp_tf = TFUtil.calc_logp_gaussian(x_tf=norm_a_noise_tf,
mean_tf=None,
std_tf=self.norm_a_std_tf)
if (critic_weight_decay != 0):
self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss('main/critic')
norm_tar_a_tf = self.a_norm.normalize_tf(self.a_tf)
self._norm_a_mean_tf = self.a_norm.normalize_tf(self.a_mean_tf)
return
self.logp_tf = TFUtil.calc_logp_gaussian(norm_tar_a_tf, self._norm_a_mean_tf, self.norm_a_std_tf)
ratio_tf = tf.exp(self.logp_tf - self.old_logp_tf)
actor_loss0 = self.adv_tf * ratio_tf
actor_loss1 = self.adv_tf * tf.clip_by_value(ratio_tf, 1.0 - self.ratio_clip, 1 + self.ratio_clip)
self.actor_loss_tf = -tf.reduce_mean(tf.minimum(actor_loss0, actor_loss1))
def _build_losses(self, json_data):
actor_weight_decay = 0 if (
self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
critic_weight_decay = 0 if (
self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]
norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
a_bound_loss = TFUtil.calc_bound_loss(self._norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
self.actor_loss_tf += a_bound_loss
norm_val_diff = self.val_norm.normalize_tf(self.tar_val_tf) - self.val_norm.normalize_tf(
self.critic_tf)
self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))
if (actor_weight_decay != 0):
self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss('main/actor')
# for debugging
self.clip_frac_tf = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio_tf - 1.0), self.ratio_clip)))
if (critic_weight_decay != 0):
self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss('main/critic')
return
norm_tar_a_tf = self.a_norm.normalize_tf(self.a_tf)
self._norm_a_mean_tf = self.a_norm.normalize_tf(self.a_mean_tf)
def _build_solvers(self, json_data):
actor_stepsize = 0.001 if (self.ACTOR_STEPSIZE_KEY not in json_data) else json_data[self.ACTOR_STEPSIZE_KEY]
actor_momentum = 0.9 if (self.ACTOR_MOMENTUM_KEY not in json_data) else json_data[self.ACTOR_MOMENTUM_KEY]
critic_stepsize = 0.01 if (self.CRITIC_STEPSIZE_KEY not in json_data) else json_data[self.CRITIC_STEPSIZE_KEY]
critic_momentum = 0.9 if (self.CRITIC_MOMENTUM_KEY not in json_data) else json_data[self.CRITIC_MOMENTUM_KEY]
critic_vars = self._tf_vars('main/critic')
critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize, momentum=critic_momentum)
self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)
self.logp_tf = TFUtil.calc_logp_gaussian(norm_tar_a_tf, self._norm_a_mean_tf,
self.norm_a_std_tf)
ratio_tf = tf.exp(self.logp_tf - self.old_logp_tf)
actor_loss0 = self.adv_tf * ratio_tf
actor_loss1 = self.adv_tf * tf.clip_by_value(ratio_tf, 1.0 - self.ratio_clip,
1 + self.ratio_clip)
self.actor_loss_tf = -tf.reduce_mean(tf.minimum(actor_loss0, actor_loss1))
self._actor_stepsize_tf = tf.get_variable(dtype=tf.float32, name='actor_stepsize', initializer=actor_stepsize, trainable=False)
self._actor_stepsize_ph = tf.get_variable(dtype=tf.float32, name='actor_stepsize_ph', shape=[])
self._actor_stepsize_update_op = self._actor_stepsize_tf.assign(self._actor_stepsize_ph)
norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
a_bound_loss = TFUtil.calc_bound_loss(self._norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
self.actor_loss_tf += a_bound_loss
actor_vars = self._tf_vars('main/actor')
actor_opt = tf.train.MomentumOptimizer(learning_rate=self._actor_stepsize_tf, momentum=actor_momentum)
self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)
return
if (actor_weight_decay != 0):
self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss('main/actor')
def _decide_action(self, s, g):
with self.sess.as_default(), self.graph.as_default():
self._exp_action = self._enable_stoch_policy() and MathUtil.flip_coin(self.exp_params_curr.rate)
#print("_decide_action._exp_action=",self._exp_action)
a, logp = self._eval_actor(s, g, self._exp_action)
return a[0], logp[0]
# for debugging
self.clip_frac_tf = tf.reduce_mean(
tf.to_float(tf.greater(tf.abs(ratio_tf - 1.0), self.ratio_clip)))
def _eval_actor(self, s, g, enable_exp):
s = np.reshape(s, [-1, self.get_state_size()])
g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None
feed = {
self.s_tf : s,
self.g_tf : g,
self.exp_mask_tf: np.array([1 if enable_exp else 0])
}
return
a, logp = self.sess.run([self.sample_a_tf, self.sample_a_logp_tf], feed_dict=feed)
return a, logp
def _build_solvers(self, json_data):
actor_stepsize = 0.001 if (
self.ACTOR_STEPSIZE_KEY not in json_data) else json_data[self.ACTOR_STEPSIZE_KEY]
actor_momentum = 0.9 if (
self.ACTOR_MOMENTUM_KEY not in json_data) else json_data[self.ACTOR_MOMENTUM_KEY]
critic_stepsize = 0.01 if (
self.CRITIC_STEPSIZE_KEY not in json_data) else json_data[self.CRITIC_STEPSIZE_KEY]
critic_momentum = 0.9 if (
self.CRITIC_MOMENTUM_KEY not in json_data) else json_data[self.CRITIC_MOMENTUM_KEY]
def _train_step(self):
adv_eps = 1e-5
critic_vars = self._tf_vars('main/critic')
critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize,
momentum=critic_momentum)
self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)
start_idx = self.replay_buffer.buffer_tail
end_idx = self.replay_buffer.buffer_head
assert(start_idx == 0)
assert(self.replay_buffer.get_current_size() <= self.replay_buffer.buffer_size) # must avoid overflow
assert(start_idx < end_idx)
self._actor_stepsize_tf = tf.get_variable(dtype=tf.float32,
name='actor_stepsize',
initializer=actor_stepsize,
trainable=False)
self._actor_stepsize_ph = tf.get_variable(dtype=tf.float32, name='actor_stepsize_ph', shape=[])
self._actor_stepsize_update_op = self._actor_stepsize_tf.assign(self._actor_stepsize_ph)
idx = np.array(list(range(start_idx, end_idx)))
end_mask = self.replay_buffer.is_path_end(idx)
end_mask = np.logical_not(end_mask)
vals = self._compute_batch_vals(start_idx, end_idx)
new_vals = self._compute_batch_new_vals(start_idx, end_idx, vals)
actor_vars = self._tf_vars('main/actor')
actor_opt = tf.train.MomentumOptimizer(learning_rate=self._actor_stepsize_tf,
momentum=actor_momentum)
self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)
valid_idx = idx[end_mask]
exp_idx = self.replay_buffer.get_idx_filtered(self.EXP_ACTION_FLAG).copy()
num_valid_idx = valid_idx.shape[0]
num_exp_idx = exp_idx.shape[0]
exp_idx = np.column_stack([exp_idx, np.array(list(range(0, num_exp_idx)), dtype=np.int32)])
local_sample_count = valid_idx.size
global_sample_count = int(MPIUtil.reduce_sum(local_sample_count))
mini_batches = int(np.ceil(global_sample_count / self.mini_batch_size))
adv = new_vals[exp_idx[:,0]] - vals[exp_idx[:,0]]
new_vals = np.clip(new_vals, self.val_min, self.val_max)
return
adv_mean = np.mean(adv)
adv_std = np.std(adv)
adv = (adv - adv_mean) / (adv_std + adv_eps)
adv = np.clip(adv, -self.norm_adv_clip, self.norm_adv_clip)
def _decide_action(self, s, g):
with self.sess.as_default(), self.graph.as_default():
self._exp_action = self._enable_stoch_policy() and MathUtil.flip_coin(
self.exp_params_curr.rate)
#print("_decide_action._exp_action=",self._exp_action)
a, logp = self._eval_actor(s, g, self._exp_action)
return a[0], logp[0]
critic_loss = 0
actor_loss = 0
actor_clip_frac = 0
def _eval_actor(self, s, g, enable_exp):
s = np.reshape(s, [-1, self.get_state_size()])
g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None
for e in range(self.epochs):
np.random.shuffle(valid_idx)
np.random.shuffle(exp_idx)
feed = {self.s_tf: s, self.g_tf: g, self.exp_mask_tf: np.array([1 if enable_exp else 0])}
for b in range(mini_batches):
batch_idx_beg = b * self._local_mini_batch_size
batch_idx_end = batch_idx_beg + self._local_mini_batch_size
a, logp = self.sess.run([self.sample_a_tf, self.sample_a_logp_tf], feed_dict=feed)
return a, logp
critic_batch = np.array(range(batch_idx_beg, batch_idx_end), dtype=np.int32)
actor_batch = critic_batch.copy()
critic_batch = np.mod(critic_batch, num_valid_idx)
actor_batch = np.mod(actor_batch, num_exp_idx)
shuffle_actor = (actor_batch[-1] < actor_batch[0]) or (actor_batch[-1] == num_exp_idx - 1)
def _train_step(self):
adv_eps = 1e-5
critic_batch = valid_idx[critic_batch]
actor_batch = exp_idx[actor_batch]
critic_batch_vals = new_vals[critic_batch]
actor_batch_adv = adv[actor_batch[:,1]]
start_idx = self.replay_buffer.buffer_tail
end_idx = self.replay_buffer.buffer_head
assert (start_idx == 0)
assert (self.replay_buffer.get_current_size() <= self.replay_buffer.buffer_size
) # must avoid overflow
assert (start_idx < end_idx)
critic_s = self.replay_buffer.get('states', critic_batch)
critic_g = self.replay_buffer.get('goals', critic_batch) if self.has_goal() else None
curr_critic_loss = self._update_critic(critic_s, critic_g, critic_batch_vals)
idx = np.array(list(range(start_idx, end_idx)))
end_mask = self.replay_buffer.is_path_end(idx)
end_mask = np.logical_not(end_mask)
actor_s = self.replay_buffer.get("states", actor_batch[:,0])
actor_g = self.replay_buffer.get("goals", actor_batch[:,0]) if self.has_goal() else None
actor_a = self.replay_buffer.get("actions", actor_batch[:,0])
actor_logp = self.replay_buffer.get("logps", actor_batch[:,0])
curr_actor_loss, curr_actor_clip_frac = self._update_actor(actor_s, actor_g, actor_a, actor_logp, actor_batch_adv)
critic_loss += curr_critic_loss
actor_loss += np.abs(curr_actor_loss)
actor_clip_frac += curr_actor_clip_frac
vals = self._compute_batch_vals(start_idx, end_idx)
new_vals = self._compute_batch_new_vals(start_idx, end_idx, vals)
if (shuffle_actor):
np.random.shuffle(exp_idx)
valid_idx = idx[end_mask]
exp_idx = self.replay_buffer.get_idx_filtered(self.EXP_ACTION_FLAG).copy()
num_valid_idx = valid_idx.shape[0]
num_exp_idx = exp_idx.shape[0]
exp_idx = np.column_stack([exp_idx, np.array(list(range(0, num_exp_idx)), dtype=np.int32)])
total_batches = mini_batches * self.epochs
critic_loss /= total_batches
actor_loss /= total_batches
actor_clip_frac /= total_batches
local_sample_count = valid_idx.size
global_sample_count = int(MPIUtil.reduce_sum(local_sample_count))
mini_batches = int(np.ceil(global_sample_count / self.mini_batch_size))
critic_loss = MPIUtil.reduce_avg(critic_loss)
actor_loss = MPIUtil.reduce_avg(actor_loss)
actor_clip_frac = MPIUtil.reduce_avg(actor_clip_frac)
adv = new_vals[exp_idx[:, 0]] - vals[exp_idx[:, 0]]
new_vals = np.clip(new_vals, self.val_min, self.val_max)
critic_stepsize = self.critic_solver.get_stepsize()
actor_stepsize = self.update_actor_stepsize(actor_clip_frac)
adv_mean = np.mean(adv)
adv_std = np.std(adv)
adv = (adv - adv_mean) / (adv_std + adv_eps)
adv = np.clip(adv, -self.norm_adv_clip, self.norm_adv_clip)
self.logger.log_tabular('Critic_Loss', critic_loss)
self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
self.logger.log_tabular('Actor_Loss', actor_loss)
self.logger.log_tabular('Actor_Stepsize', actor_stepsize)
self.logger.log_tabular('Clip_Frac', actor_clip_frac)
self.logger.log_tabular('Adv_Mean', adv_mean)
self.logger.log_tabular('Adv_Std', adv_std)
critic_loss = 0
actor_loss = 0
actor_clip_frac = 0
self.replay_buffer.clear()
for e in range(self.epochs):
np.random.shuffle(valid_idx)
np.random.shuffle(exp_idx)
return
for b in range(mini_batches):
batch_idx_beg = b * self._local_mini_batch_size
batch_idx_end = batch_idx_beg + self._local_mini_batch_size
def _get_iters_per_update(self):
return 1
critic_batch = np.array(range(batch_idx_beg, batch_idx_end), dtype=np.int32)
actor_batch = critic_batch.copy()
critic_batch = np.mod(critic_batch, num_valid_idx)
actor_batch = np.mod(actor_batch, num_exp_idx)
shuffle_actor = (actor_batch[-1] < actor_batch[0]) or (actor_batch[-1] == num_exp_idx - 1)
def _valid_train_step(self):
samples = self.replay_buffer.get_current_size()
exp_samples = self.replay_buffer.count_filtered(self.EXP_ACTION_FLAG)
global_sample_count = int(MPIUtil.reduce_sum(samples))
global_exp_min = int(MPIUtil.reduce_min(exp_samples))
return (global_sample_count > self.batch_size) and (global_exp_min > 0)
critic_batch = valid_idx[critic_batch]
actor_batch = exp_idx[actor_batch]
critic_batch_vals = new_vals[critic_batch]
actor_batch_adv = adv[actor_batch[:, 1]]
def _compute_batch_vals(self, start_idx, end_idx):
states = self.replay_buffer.get_all("states")[start_idx:end_idx]
goals = self.replay_buffer.get_all("goals")[start_idx:end_idx] if self.has_goal() else None
idx = np.array(list(range(start_idx, end_idx)))
is_end = self.replay_buffer.is_path_end(idx)
is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
is_fail = np.logical_and(is_end, is_fail)
is_succ = np.logical_and(is_end, is_succ)
critic_s = self.replay_buffer.get('states', critic_batch)
critic_g = self.replay_buffer.get('goals', critic_batch) if self.has_goal() else None
curr_critic_loss = self._update_critic(critic_s, critic_g, critic_batch_vals)
vals = self._eval_critic(states, goals)
vals[is_fail] = self.val_fail
vals[is_succ] = self.val_succ
actor_s = self.replay_buffer.get("states", actor_batch[:, 0])
actor_g = self.replay_buffer.get("goals", actor_batch[:, 0]) if self.has_goal() else None
actor_a = self.replay_buffer.get("actions", actor_batch[:, 0])
actor_logp = self.replay_buffer.get("logps", actor_batch[:, 0])
curr_actor_loss, curr_actor_clip_frac = self._update_actor(actor_s, actor_g, actor_a,
actor_logp, actor_batch_adv)
return vals
critic_loss += curr_critic_loss
actor_loss += np.abs(curr_actor_loss)
actor_clip_frac += curr_actor_clip_frac
def _compute_batch_new_vals(self, start_idx, end_idx, val_buffer):
rewards = self.replay_buffer.get_all("rewards")[start_idx:end_idx]
if (shuffle_actor):
np.random.shuffle(exp_idx)
if self.discount == 0:
new_vals = rewards.copy()
total_batches = mini_batches * self.epochs
critic_loss /= total_batches
actor_loss /= total_batches
actor_clip_frac /= total_batches
critic_loss = MPIUtil.reduce_avg(critic_loss)
actor_loss = MPIUtil.reduce_avg(actor_loss)
actor_clip_frac = MPIUtil.reduce_avg(actor_clip_frac)
critic_stepsize = self.critic_solver.get_stepsize()
actor_stepsize = self.update_actor_stepsize(actor_clip_frac)
self.logger.log_tabular('Critic_Loss', critic_loss)
self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
self.logger.log_tabular('Actor_Loss', actor_loss)
self.logger.log_tabular('Actor_Stepsize', actor_stepsize)
self.logger.log_tabular('Clip_Frac', actor_clip_frac)
self.logger.log_tabular('Adv_Mean', adv_mean)
self.logger.log_tabular('Adv_Std', adv_std)
self.replay_buffer.clear()
return
def _get_iters_per_update(self):
return 1
def _valid_train_step(self):
samples = self.replay_buffer.get_current_size()
exp_samples = self.replay_buffer.count_filtered(self.EXP_ACTION_FLAG)
global_sample_count = int(MPIUtil.reduce_sum(samples))
global_exp_min = int(MPIUtil.reduce_min(exp_samples))
return (global_sample_count > self.batch_size) and (global_exp_min > 0)
def _compute_batch_vals(self, start_idx, end_idx):
states = self.replay_buffer.get_all("states")[start_idx:end_idx]
goals = self.replay_buffer.get_all("goals")[start_idx:end_idx] if self.has_goal() else None
idx = np.array(list(range(start_idx, end_idx)))
is_end = self.replay_buffer.is_path_end(idx)
is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
is_fail = np.logical_and(is_end, is_fail)
is_succ = np.logical_and(is_end, is_succ)
vals = self._eval_critic(states, goals)
vals[is_fail] = self.val_fail
vals[is_succ] = self.val_succ
return vals
def _compute_batch_new_vals(self, start_idx, end_idx, val_buffer):
rewards = self.replay_buffer.get_all("rewards")[start_idx:end_idx]
if self.discount == 0:
new_vals = rewards.copy()
else:
new_vals = np.zeros_like(val_buffer)
curr_idx = start_idx
while curr_idx < end_idx:
idx0 = curr_idx - start_idx
idx1 = self.replay_buffer.get_path_end(curr_idx) - start_idx
r = rewards[idx0:idx1]
v = val_buffer[idx0:(idx1 + 1)]
new_vals[idx0:idx1] = RLUtil.compute_return(r, self.discount, self.td_lambda, v)
curr_idx = idx1 + start_idx + 1
return new_vals
def _update_critic(self, s, g, tar_vals):
feed = {self.s_tf: s, self.g_tf: g, self.tar_val_tf: tar_vals}
loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf], feed)
self.critic_solver.update(grads)
return loss
def _update_actor(self, s, g, a, logp, adv):
feed = {self.s_tf: s, self.g_tf: g, self.a_tf: a, self.adv_tf: adv, self.old_logp_tf: logp}
loss, grads, clip_frac = self.sess.run(
[self.actor_loss_tf, self.actor_grad_tf, self.clip_frac_tf], feed)
self.actor_solver.update(grads)
return loss, clip_frac
def update_actor_stepsize(self, clip_frac):
clip_tol = 1.5
step_scale = 2
max_stepsize = 1e-2
min_stepsize = 1e-8
warmup_iters = 5
actor_stepsize = self.actor_solver.get_stepsize()
if (self.tar_clip_frac >= 0 and self.iter > warmup_iters):
min_clip = self.tar_clip_frac / clip_tol
max_clip = self.tar_clip_frac * clip_tol
under_tol = clip_frac < min_clip
over_tol = clip_frac > max_clip
if (over_tol or under_tol):
if (over_tol):
actor_stepsize *= self.actor_stepsize_decay
else:
new_vals = np.zeros_like(val_buffer)
actor_stepsize /= self.actor_stepsize_decay
curr_idx = start_idx
while curr_idx < end_idx:
idx0 = curr_idx - start_idx
idx1 = self.replay_buffer.get_path_end(curr_idx) - start_idx
r = rewards[idx0:idx1]
v = val_buffer[idx0:(idx1 + 1)]
actor_stepsize = np.clip(actor_stepsize, min_stepsize, max_stepsize)
self.set_actor_stepsize(actor_stepsize)
new_vals[idx0:idx1] = RLUtil.compute_return(r, self.discount, self.td_lambda, v)
curr_idx = idx1 + start_idx + 1
return new_vals
return actor_stepsize
def _update_critic(self, s, g, tar_vals):
feed = {
self.s_tf: s,
self.g_tf: g,
self.tar_val_tf: tar_vals
}
loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf], feed)
self.critic_solver.update(grads)
return loss
def _update_actor(self, s, g, a, logp, adv):
feed = {
self.s_tf: s,
self.g_tf: g,
self.a_tf: a,
self.adv_tf: adv,
self.old_logp_tf: logp
}
loss, grads, clip_frac = self.sess.run([self.actor_loss_tf, self.actor_grad_tf,
self.clip_frac_tf], feed)
self.actor_solver.update(grads)
return loss, clip_frac
def update_actor_stepsize(self, clip_frac):
clip_tol = 1.5
step_scale = 2
max_stepsize = 1e-2
min_stepsize = 1e-8
warmup_iters = 5
actor_stepsize = self.actor_solver.get_stepsize()
if (self.tar_clip_frac >= 0 and self.iter > warmup_iters):
min_clip = self.tar_clip_frac / clip_tol
max_clip = self.tar_clip_frac * clip_tol
under_tol = clip_frac < min_clip
over_tol = clip_frac > max_clip
if (over_tol or under_tol):
if (over_tol):
actor_stepsize *= self.actor_stepsize_decay
else:
actor_stepsize /= self.actor_stepsize_decay
actor_stepsize = np.clip(actor_stepsize, min_stepsize, max_stepsize)
self.set_actor_stepsize(actor_stepsize)
return actor_stepsize
def set_actor_stepsize(self, stepsize):
feed = {
self._actor_stepsize_ph: stepsize,
}
self.sess.run(self._actor_stepsize_update_op, feed)
return
def set_actor_stepsize(self, stepsize):
feed = {
self._actor_stepsize_ph: stepsize,
}
self.sess.run(self._actor_stepsize_update_op, feed)
return
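
The actor stepsize update above is driven by the PPO clip fraction: shrink the stepsize when too many samples hit the clip range, grow it when too few do. A minimal standalone sketch of that rule, using an illustrative decay factor and target clip fraction (not values taken from any agent config):

import numpy as np


def adapt_stepsize(stepsize, clip_frac, tar_clip_frac=0.25, decay=0.5,
                   clip_tol=1.5, min_stepsize=1e-8, max_stepsize=1e-2):
  # shrink the stepsize when too many samples are clipped, grow it when too few are
  if clip_frac > tar_clip_frac * clip_tol:
    stepsize *= decay
  elif clip_frac < tar_clip_frac / clip_tol:
    stepsize /= decay
  return float(np.clip(stepsize, min_stepsize, max_stepsize))


print(adapt_stepsize(1e-3, clip_frac=0.6))   # over tolerance  -> 5e-4
print(adapt_stepsize(1e-3, clip_frac=0.05))  # under tolerance -> 2e-3
print(adapt_stepsize(1e-3, clip_frac=0.3))   # within tolerance -> unchanged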

View File

@@ -5,347 +5,353 @@ import inspect as inspect
from pybullet_envs.deep_mimic.env.env import Env
import pybullet_utils.math_util as MathUtil
class ReplayBuffer(object):
  TERMINATE_KEY = 'terminate'
  PATH_START_KEY = 'path_start'
  PATH_END_KEY = 'path_end'

  def __init__(self, buffer_size):
    assert buffer_size > 0

    self.buffer_size = buffer_size
    self.total_count = 0
    self.buffer_head = 0
    self.buffer_tail = MathUtil.INVALID_IDX
    self.num_paths = 0
    self._sample_buffers = dict()
    self.buffers = None

    self.clear()
    return

  def sample(self, n):
    curr_size = self.get_current_size()
    assert curr_size > 0

    idx = np.empty(n, dtype=int)
    # makes sure that the end states are not sampled
    for i in range(n):
      while True:
        curr_idx = np.random.randint(0, curr_size, size=1)[0]
        curr_idx += self.buffer_tail
        curr_idx = np.mod(curr_idx, self.buffer_size)

        if not self.is_path_end(curr_idx):
          break
      idx[i] = curr_idx

    return idx

  def sample_filtered(self, n, key):
    assert key in self._sample_buffers
    curr_buffer = self._sample_buffers[key]
    idx = curr_buffer.sample(n)
    return idx

  def count_filtered(self, key):
    curr_buffer = self._sample_buffers[key]
    return curr_buffer.count

  def get(self, key, idx):
    return self.buffers[key][idx]

  def get_all(self, key):
    return self.buffers[key]

  def get_idx_filtered(self, key):
    assert key in self._sample_buffers
    curr_buffer = self._sample_buffers[key]
    idx = curr_buffer.slot_to_idx[:curr_buffer.count]
    return idx

  def get_path_start(self, idx):
    return self.buffers[self.PATH_START_KEY][idx]

  def get_path_end(self, idx):
    return self.buffers[self.PATH_END_KEY][idx]

  def get_pathlen(self, idx):
    is_array = isinstance(idx, np.ndarray) or isinstance(idx, list)
    if not is_array:
      idx = [idx]

    n = len(idx)
    start_idx = self.get_path_start(idx)
    end_idx = self.get_path_end(idx)
    pathlen = np.empty(n, dtype=int)

    for i in range(n):
      curr_start = start_idx[i]
      curr_end = end_idx[i]
      if curr_start < curr_end:
        curr_len = curr_end - curr_start
      else:
        curr_len = self.buffer_size - curr_start + curr_end
      pathlen[i] = curr_len

    if not is_array:
      pathlen = pathlen[0]

    return pathlen

  def is_valid_path(self, idx):
    start_idx = self.get_path_start(idx)
    valid = start_idx != MathUtil.INVALID_IDX
    return valid

  def store(self, path):
    start_idx = MathUtil.INVALID_IDX
    n = path.pathlength()
    if (n > 0):
      assert path.is_valid()

      if path.check_vals():
        if self.buffers is None:
          self._init_buffers(path)

        idx = self._request_idx(n + 1)
        self._store_path(path, idx)
        self._add_sample_buffers(idx)

        self.num_paths += 1
        self.total_count += n + 1
        start_idx = idx[0]
      else:
        Logger.print2('Invalid path data value detected')

    return start_idx

  def clear(self):
    self.buffer_head = 0
    self.buffer_tail = MathUtil.INVALID_IDX
    self.num_paths = 0

    for key in self._sample_buffers:
      self._sample_buffers[key].clear()
    return

  def get_next_idx(self, idx):
    next_idx = np.mod(idx + 1, self.buffer_size)
    return next_idx

  def is_terminal_state(self, idx):
    terminate_flags = self.buffers[self.TERMINATE_KEY][idx]
    terminate = terminate_flags != Env.Terminate.Null.value
    is_end = self.is_path_end(idx)
    terminal_state = np.logical_and(terminate, is_end)
    return terminal_state

  def check_terminal_flag(self, idx, flag):
    terminate_flags = self.buffers[self.TERMINATE_KEY][idx]
    terminate = terminate_flags == flag.value
    return terminate

  def is_path_end(self, idx):
    is_end = self.buffers[self.PATH_END_KEY][idx] == idx
    return is_end

  def add_filter_key(self, key):
    assert self.get_current_size() == 0
    if key not in self._sample_buffers:
      self._sample_buffers[key] = SampleBuffer(self.buffer_size)
    return

  def get_current_size(self):
    if self.buffer_tail == MathUtil.INVALID_IDX:
      return 0
    elif self.buffer_tail < self.buffer_head:
      return self.buffer_head - self.buffer_tail
    else:
      return self.buffer_size - self.buffer_tail + self.buffer_head

  def _check_flags(self, key, flags):
    return (flags & key) == key

  def _add_sample_buffers(self, idx):
    flags = self.buffers['flags']
    for key in self._sample_buffers:
      curr_buffer = self._sample_buffers[key]
      filter_idx = [
          i for i in idx if (self._check_flags(key, flags[i]) and not self.is_path_end(i))
      ]
      curr_buffer.add(filter_idx)
    return

  def _free_sample_buffers(self, idx):
    for key in self._sample_buffers:
      curr_buffer = self._sample_buffers[key]
      curr_buffer.free(idx)
    return

  def _init_buffers(self, path):
    self.buffers = dict()
    self.buffers[self.PATH_START_KEY] = MathUtil.INVALID_IDX * np.ones(self.buffer_size, dtype=int)
    self.buffers[self.PATH_END_KEY] = MathUtil.INVALID_IDX * np.ones(self.buffer_size, dtype=int)

    for key in dir(path):
      val = getattr(path, key)
      if not key.startswith('__') and not inspect.ismethod(val):
        if key == self.TERMINATE_KEY:
          self.buffers[self.TERMINATE_KEY] = np.zeros(shape=[self.buffer_size], dtype=int)
        else:
          val_type = type(val[0])
          is_array = val_type == np.ndarray
          if is_array:
            shape = [self.buffer_size, val[0].shape[0]]
            dtype = val[0].dtype
          else:
            shape = [self.buffer_size]
            dtype = val_type

          self.buffers[key] = np.zeros(shape, dtype=dtype)
    return

  def _request_idx(self, n):
    assert n + 1 < self.buffer_size  # bad things can happen if path is too long

    remainder = n
    idx = []

    start_idx = self.buffer_head
    while remainder > 0:
      end_idx = np.minimum(start_idx + remainder, self.buffer_size)
      remainder -= (end_idx - start_idx)

      free_idx = list(range(start_idx, end_idx))
      self._free_idx(free_idx)
      idx += free_idx
      start_idx = 0

    self.buffer_head = (self.buffer_head + n) % self.buffer_size
    return idx

  def _free_idx(self, idx):
    assert (idx[0] <= idx[-1])
    n = len(idx)
    if self.buffer_tail != MathUtil.INVALID_IDX:
      update_tail = idx[0] <= idx[-1] and idx[0] <= self.buffer_tail and idx[-1] >= self.buffer_tail
      update_tail |= idx[0] > idx[-1] and (idx[0] <= self.buffer_tail or
                                           idx[-1] >= self.buffer_tail)

      if update_tail:
        i = 0
        while i < n:
          curr_idx = idx[i]
          if self.is_valid_path(curr_idx):
            start_idx = self.get_path_start(curr_idx)
            end_idx = self.get_path_end(curr_idx)
            pathlen = self.get_pathlen(curr_idx)

            if start_idx < end_idx:
              self.buffers[self.PATH_START_KEY][start_idx:end_idx + 1] = MathUtil.INVALID_IDX
              self._free_sample_buffers(list(range(start_idx, end_idx + 1)))
            else:
              self.buffers[self.PATH_START_KEY][start_idx:self.buffer_size] = MathUtil.INVALID_IDX
              self.buffers[self.PATH_START_KEY][0:end_idx + 1] = MathUtil.INVALID_IDX
              self._free_sample_buffers(list(range(start_idx, self.buffer_size)))
              self._free_sample_buffers(list(range(0, end_idx + 1)))

            self.num_paths -= 1
            i += pathlen + 1
            self.buffer_tail = (end_idx + 1) % self.buffer_size
          else:
            i += 1
    else:
      self.buffer_tail = idx[0]
    return

  def _store_path(self, path, idx):
    n = path.pathlength()
    for key, data in self.buffers.items():
      if key != self.PATH_START_KEY and key != self.PATH_END_KEY and key != self.TERMINATE_KEY:
        val = getattr(path, key)
        val_len = len(val)
        assert val_len == n or val_len == n + 1
        data[idx[:val_len]] = val

    self.buffers[self.TERMINATE_KEY][idx] = path.terminate.value
    self.buffers[self.PATH_START_KEY][idx] = idx[0]
    self.buffers[self.PATH_END_KEY][idx] = idx[-1]
    return


class SampleBuffer(object):

  def __init__(self, size):
    self.idx_to_slot = np.empty(shape=[size], dtype=int)
    self.slot_to_idx = np.empty(shape=[size], dtype=int)
    self.count = 0
    self.clear()
    return

  def clear(self):
    self.idx_to_slot.fill(MathUtil.INVALID_IDX)
    self.slot_to_idx.fill(MathUtil.INVALID_IDX)
    self.count = 0
    return

  def is_valid(self, idx):
    return self.idx_to_slot[idx] != MathUtil.INVALID_IDX

  def get_size(self):
    return self.idx_to_slot.shape[0]

  def add(self, idx):
    for i in idx:
      if not self.is_valid(i):
        new_slot = self.count
        assert new_slot >= 0

        self.idx_to_slot[i] = new_slot
        self.slot_to_idx[new_slot] = i
        self.count += 1
    return

  def free(self, idx):
    for i in idx:
      if self.is_valid(i):
        slot = self.idx_to_slot[i]
        last_slot = self.count - 1
        last_idx = self.slot_to_idx[last_slot]

        self.idx_to_slot[last_idx] = slot
        self.slot_to_idx[slot] = last_idx
        self.idx_to_slot[i] = MathUtil.INVALID_IDX
        self.slot_to_idx[last_slot] = MathUtil.INVALID_IDX
        self.count -= 1
    return

  def sample(self, n):
    if self.count > 0:
      slots = np.random.randint(0, self.count, size=n)
      idx = self.slot_to_idx[slots]
    else:
      idx = np.empty(shape=[0], dtype=int)
    return idx

  def check_consistency(self):
    valid = True
    if self.count < 0:
      valid = False

    if valid:
      for i in range(self.get_size()):
        if self.is_valid(i):
          s = self.idx_to_slot[i]
          if self.slot_to_idx[s] != i:
            valid = False
            break

        s2i = self.slot_to_idx[i]
        if s2i != MathUtil.INVALID_IDX:
          i2s = self.idx_to_slot[s2i]
          if i2s != i:
            valid = False
            break

    count0 = np.sum(self.idx_to_slot == MathUtil.INVALID_IDX)
    count1 = np.sum(self.slot_to_idx == MathUtil.INVALID_IDX)
    valid &= count0 == count1

    return valid
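
A usage sketch of the SampleBuffer bookkeeping above. The import path is an assumption (the class is defined in this module, assumed to be pybullet_envs.deep_mimic.learning.replay_buffer); the indices are illustrative:

from pybullet_envs.deep_mimic.learning.replay_buffer import SampleBuffer

buf = SampleBuffer(8)  # tracks which replay indices carry a given flag
buf.add([1, 3, 5])     # mark indices 1, 3 and 5 as sampleable
assert buf.count == 3

buf.free([3])          # the last live index (5) is swapped into the freed slot
assert buf.count == 2
assert sorted(buf.slot_to_idx[:buf.count].tolist()) == [1, 5]

idx = buf.sample(4)    # samples (with replacement) only from the live indices
assert all(i in (1, 5) for i in idx)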

View File

@@ -1,18 +1,19 @@
import numpy as np
def compute_return(rewards, gamma, td_lambda, val_t):
  # computes td-lambda return of path
  path_len = len(rewards)
  assert len(val_t) == path_len + 1

  return_t = np.zeros(path_len)
  last_val = rewards[-1] + gamma * val_t[-1]
  return_t[-1] = last_val

  for i in reversed(range(0, path_len - 1)):
    curr_r = rewards[i]
    next_ret = return_t[i + 1]
    curr_val = curr_r + gamma * ((1.0 - td_lambda) * val_t[i + 1] + td_lambda * next_ret)
    return_t[i] = curr_val

  return return_t
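
As a sanity check on the recursion above, a small hand-checkable sketch with illustrative numbers (a standalone restatement of the same recursion, not the library function itself):

import numpy as np


def td_lambda_return(rewards, gamma, td_lambda, val_t):
  # same backward recursion as compute_return above
  ret = np.zeros(len(rewards))
  ret[-1] = rewards[-1] + gamma * val_t[-1]
  for i in reversed(range(len(rewards) - 1)):
    ret[i] = rewards[i] + gamma * ((1.0 - td_lambda) * val_t[i + 1] + td_lambda * ret[i + 1])
  return ret


rewards = np.array([1.0, 1.0, 1.0])
val_t = np.array([0.5, 0.5, 0.5, 0.5])
print(td_lambda_return(rewards, 0.9, 0.95, val_t))
# approx [2.957, 2.262, 1.45]: the last step bootstraps off val_t[-1],
# earlier steps blend the one-step target with the next lambda-return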

View File

@@ -5,139 +5,140 @@ from pybullet_envs.deep_mimic.learning.rl_agent import RLAgent
from pybullet_utils.logger import Logger
import pybullet_data
class RLWorld(object):
  def __init__(self, env, arg_parser):
    TFUtil.disable_gpu()

    self.env = env
    self.arg_parser = arg_parser
    self._enable_training = True
    self.train_agents = []
    self.parse_args(arg_parser)

    self.build_agents()

    return

  def get_enable_training(self):
    return self._enable_training

  def set_enable_training(self, enable):
    self._enable_training = enable
    for i in range(len(self.agents)):
      curr_agent = self.agents[i]
      if curr_agent is not None:
        enable_curr_train = self.train_agents[i] if (len(self.train_agents) > 0) else True
        curr_agent.enable_training = self.enable_training and enable_curr_train

    if (self._enable_training):
      self.env.set_mode(RLAgent.Mode.TRAIN)
    else:
      self.env.set_mode(RLAgent.Mode.TEST)

    return

  enable_training = property(get_enable_training, set_enable_training)

  def parse_args(self, arg_parser):
    self.train_agents = self.arg_parser.parse_bools('train_agents')
    num_agents = self.env.get_num_agents()
    assert (len(self.train_agents) == num_agents or len(self.train_agents) == 0)

    return

  def shutdown(self):
    self.env.shutdown()
    return

  def build_agents(self):
    num_agents = self.env.get_num_agents()
    print("num_agents=", num_agents)
    self.agents = []

    Logger.print2('')
    Logger.print2('Num Agents: {:d}'.format(num_agents))

    agent_files = self.arg_parser.parse_strings('agent_files')
    print("len(agent_files)=", len(agent_files))
    assert (len(agent_files) == num_agents or len(agent_files) == 0)

    model_files = self.arg_parser.parse_strings('model_files')
    assert (len(model_files) == num_agents or len(model_files) == 0)

    output_path = self.arg_parser.parse_string('output_path')
    int_output_path = self.arg_parser.parse_string('int_output_path')

    for i in range(num_agents):
      curr_file = agent_files[i]
      curr_agent = self._build_agent(i, curr_file)

      if curr_agent is not None:
        curr_agent.output_dir = output_path
        curr_agent.int_output_dir = int_output_path
        Logger.print2(str(curr_agent))

        if (len(model_files) > 0):
          curr_model_file = model_files[i]
          if curr_model_file != 'none':
            curr_agent.load_model(pybullet_data.getDataPath() + "/" + curr_model_file)

      self.agents.append(curr_agent)
      Logger.print2('')

    self.set_enable_training(self.enable_training)

    return

  def update(self, timestep):
    #print("world update!\n")
    self._update_agents(timestep)
    self._update_env(timestep)
    return

  def reset(self):
    self._reset_agents()
    self._reset_env()
    return

  def end_episode(self):
    self._end_episode_agents()
    return

  def _update_env(self, timestep):
    self.env.update(timestep)
    return

  def _update_agents(self, timestep):
    #print("len(agents)=",len(self.agents))
    for agent in self.agents:
      if (agent is not None):
        agent.update(timestep)
    return

  def _reset_env(self):
    self.env.reset()
    return

  def _reset_agents(self):
    for agent in self.agents:
      if (agent != None):
        agent.reset()
    return

  def _end_episode_agents(self):
    for agent in self.agents:
      if (agent != None):
        agent.end_episode()
    return

  def _build_agent(self, id, agent_file):
    Logger.print2('Agent {:d}: {}'.format(id, agent_file))
    if (agent_file == 'none'):
      agent = None
    else:
      agent = AgentBuilder.build_agent(self, id, agent_file)
      assert (agent != None), 'Failed to build agent {:d} from: {}'.format(id, agent_file)

    return agent

View File

@@ -8,96 +8,97 @@ from pybullet_utils.logger import Logger
from pybullet_envs.deep_mimic.learning.solvers.solver import Solver
class MPISolver(Solver):
  CHECK_SYNC_ITERS = 1000

  def __init__(self, sess, optimizer, vars):
    super().__init__(vars)
    self.sess = sess
    self.optimizer = optimizer
    self._build_grad_feed(vars)
    self._update = optimizer.apply_gradients(zip(self._grad_tf_list, self.vars))
    self._set_flat_vars = TFUtil.SetFromFlat(sess, self.vars)
    self._get_flat_vars = TFUtil.GetFlat(sess, self.vars)

    self.iter = 0
    grad_dim = self._calc_grad_dim()
    self._flat_grad = np.zeros(grad_dim, dtype=np.float32)
    self._global_flat_grad = np.zeros(grad_dim, dtype=np.float32)

    return

  def get_stepsize(self):
    return self.optimizer._learning_rate_tensor.eval()

  def update(self, grads=None, grad_scale=1.0):
    if grads is not None:
      self._flat_grad = MathUtil.flatten(grads)
    else:
      self._flat_grad.fill(0)
    return self.update_flatgrad(self._flat_grad, grad_scale)

  def update_flatgrad(self, flat_grad, grad_scale=1.0):
    if self.iter % self.CHECK_SYNC_ITERS == 0:
      assert self.check_synced(), Logger.print2('Network parameters desynchronized')

    if grad_scale != 1.0:
      flat_grad *= grad_scale

    MPI.COMM_WORLD.Allreduce(flat_grad, self._global_flat_grad, op=MPI.SUM)
    self._global_flat_grad /= MPIUtil.get_num_procs()

    self._load_flat_grad(self._global_flat_grad)
    self.sess.run([self._update], self._grad_feed)
    self.iter += 1

    return

  def sync(self):
    vars = self._get_flat_vars()
    MPIUtil.bcast(vars)
    self._set_flat_vars(vars)
    return

  def check_synced(self):
    synced = True
    if self._is_root():
      vars = self._get_flat_vars()
      MPIUtil.bcast(vars)
    else:
      vars_local = self._get_flat_vars()
      vars_root = np.empty_like(vars_local)
      MPIUtil.bcast(vars_root)
      synced = (vars_local == vars_root).all()
    return synced

  def _is_root(self):
    return MPIUtil.is_root_proc()

  def _build_grad_feed(self, vars):
    self._grad_tf_list = []
    self._grad_buffers = []
    for v in self.vars:
      shape = v.get_shape()
      grad = np.zeros(shape)
      grad_tf = tf.placeholder(tf.float32, shape=shape)
      self._grad_buffers.append(grad)
      self._grad_tf_list.append(grad_tf)

    self._grad_feed = dict({g_tf: g for g_tf, g in zip(self._grad_tf_list, self._grad_buffers)})

    return

  def _calc_grad_dim(self):
    grad_dim = 0
    for grad in self._grad_buffers:
      grad_dim += grad.size
    return grad_dim

  def _load_flat_grad(self, flat_grad):
    start = 0
    for g in self._grad_buffers:
      size = g.size
      np.copyto(g, np.reshape(flat_grad[start:start + size], g.shape))
      start += size
    return
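
The solver ships gradients through MPI as a single flat vector. A numpy-only sketch of the flatten / _load_flat_grad round trip it relies on, with hypothetical shapes and no MPI or TensorFlow involved:

import numpy as np

grads = [np.ones((2, 3)), np.arange(4, dtype=np.float64)]  # per-variable gradients
flat = np.concatenate([g.ravel() for g in grads])          # what flattening produces
assert flat.shape == (10,)

# what _load_flat_grad does: copy slices of the (averaged) flat vector
# back into the per-variable buffers, preserving each shape
buffers = [np.zeros_like(g) for g in grads]
start = 0
for b in buffers:
  size = b.size
  np.copyto(b, flat[start:start + size].reshape(b.shape))
  start += size

assert all(np.array_equal(b, g) for b, g in zip(buffers, grads))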

View File

@@ -1,15 +1,17 @@
from abc import abstractmethod
import sys, abc
if sys.version_info >= (3, 4):
  ABC = abc.ABC
else:
  ABC = abc.ABCMeta('ABC', (), {})


class Solver(ABC):

  def __init__(self, vars):
    self.vars = vars
    return

  @abstractmethod
  def update(self, grads):
    pass

View File

@@ -6,144 +6,148 @@ from pybullet_envs.deep_mimic.learning.rl_agent import RLAgent
from pybullet_utils.logger import Logger
from pybullet_envs.deep_mimic.learning.tf_normalizer import TFNormalizer
class TFAgent(RLAgent):
  RESOURCE_SCOPE = 'resource'
  SOLVER_SCOPE = 'solvers'

  def __init__(self, world, id, json_data):
    self.tf_scope = 'agent'
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    super().__init__(world, id, json_data)
    self._build_graph(json_data)
    self._init_normalizers()
    return

  def __del__(self):
    self.sess.close()
    return

  def save_model(self, out_path):
    with self.sess.as_default(), self.graph.as_default():
      try:
        save_path = self.saver.save(self.sess, out_path, write_meta_graph=False, write_state=False)
        Logger.print2('Model saved to: ' + save_path)
      except:
        Logger.print2("Failed to save model to: " + save_path)
    return

  def load_model(self, in_path):
    with self.sess.as_default(), self.graph.as_default():
      self.saver.restore(self.sess, in_path)
      self._load_normalizers()
      Logger.print2('Model loaded from: ' + in_path)
    return

  def _get_output_path(self):
    assert (self.output_dir != '')
    file_path = self.output_dir + '/agent' + str(self.id) + '_model.ckpt'
    return file_path

  def _get_int_output_path(self):
    assert (self.int_output_dir != '')
    file_path = self.int_output_dir + (
        '/agent{:d}_models/agent{:d}_int_model_{:010d}.ckpt').format(self.id, self.id, self.iter)
    return file_path

  def _build_graph(self, json_data):
    with self.sess.as_default(), self.graph.as_default():
      with tf.variable_scope(self.tf_scope):
        self._build_nets(json_data)

        with tf.variable_scope(self.SOLVER_SCOPE):
          self._build_losses(json_data)
          self._build_solvers(json_data)

        self._initialize_vars()
        self._build_saver()
    return

  def _init_normalizers(self):
    with self.sess.as_default(), self.graph.as_default():
      # update normalizers to sync the tensorflow tensors
      self.s_norm.update()
      self.g_norm.update()
      self.a_norm.update()
    return

  @abstractmethod
  def _build_nets(self, json_data):
    pass

  @abstractmethod
  def _build_losses(self, json_data):
    pass

  @abstractmethod
  def _build_solvers(self, json_data):
    pass

  def _tf_vars(self, scope=''):
    with self.sess.as_default(), self.graph.as_default():
      res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.tf_scope + '/' + scope)
      assert len(res) > 0
    return res

  def _build_normalizers(self):
    with self.sess.as_default(), self.graph.as_default(), tf.variable_scope(self.tf_scope):
      with tf.variable_scope(self.RESOURCE_SCOPE):
        self.s_norm = TFNormalizer(self.sess, 's_norm', self.get_state_size(),
                                   self.world.env.build_state_norm_groups(self.id))
        state_offset = -self.world.env.build_state_offset(self.id)
        print("state_offset=", state_offset)
        state_scale = 1 / self.world.env.build_state_scale(self.id)
        print("state_scale=", state_scale)
        self.s_norm.set_mean_std(-self.world.env.build_state_offset(self.id),
                                 1 / self.world.env.build_state_scale(self.id))

        self.g_norm = TFNormalizer(self.sess, 'g_norm', self.get_goal_size(),
                                   self.world.env.build_goal_norm_groups(self.id))
        self.g_norm.set_mean_std(-self.world.env.build_goal_offset(self.id),
                                 1 / self.world.env.build_goal_scale(self.id))

        self.a_norm = TFNormalizer(self.sess, 'a_norm', self.get_action_size())
        self.a_norm.set_mean_std(-self.world.env.build_action_offset(self.id),
                                 1 / self.world.env.build_action_scale(self.id))
    return

  def _load_normalizers(self):
    self.s_norm.load()
    self.g_norm.load()
    self.a_norm.load()
    return

  def _update_normalizers(self):
    with self.sess.as_default(), self.graph.as_default():
      super()._update_normalizers()
    return

  def _initialize_vars(self):
    self.sess.run(tf.global_variables_initializer())
    return

  def _build_saver(self):
    vars = self._get_saver_vars()
    self.saver = tf.train.Saver(vars, max_to_keep=0)
    return

  def _get_saver_vars(self):
    with self.sess.as_default(), self.graph.as_default():
      vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.tf_scope)
      vars = [v for v in vars if '/' + self.SOLVER_SCOPE + '/' not in v.name]
      #vars = [v for v in vars if '/target/' not in v.name]
      assert len(vars) > 0
    return vars

  def _weight_decay_loss(self, scope):
    vars = self._tf_vars(scope)
    vars_no_bias = [v for v in vars if 'bias' not in v.name]
    loss = tf.add_n([tf.nn.l2_loss(v) for v in vars_no_bias])
    return loss

  def _train(self):
    with self.sess.as_default(), self.graph.as_default():
      super()._train()
    return

View File

@@ -3,65 +3,72 @@ import copy
import tensorflow as tf
from pybullet_envs.deep_mimic.learning.normalizer import Normalizer
class TFNormalizer(Normalizer):
  def __init__(self, sess, scope, size, groups_ids=None, eps=0.02, clip=np.inf):
    self.sess = sess
    self.scope = scope
    super().__init__(size, groups_ids, eps, clip)

    with tf.variable_scope(self.scope):
      self._build_resource_tf()
    return

  # initialize count when loading saved values so that things don't change too quickly during updates
  def load(self):
    self.count = self.count_tf.eval()[0]
    self.mean = self.mean_tf.eval()
    self.std = self.std_tf.eval()
    self.mean_sq = self.calc_mean_sq(self.mean, self.std)
    return

  def update(self):
    super().update()
    self._update_resource_tf()
    return

  def set_mean_std(self, mean, std):
    super().set_mean_std(mean, std)
    self._update_resource_tf()
    return

  def normalize_tf(self, x):
    norm_x = (x - self.mean_tf) / self.std_tf
    norm_x = tf.clip_by_value(norm_x, -self.clip, self.clip)
    return norm_x

  def unnormalize_tf(self, norm_x):
    x = norm_x * self.std_tf + self.mean_tf
    return x

  def _build_resource_tf(self):
    self.count_tf = tf.get_variable(dtype=tf.int32,
                                    name='count',
                                    initializer=np.array([self.count], dtype=np.int32),
                                    trainable=False)
    self.mean_tf = tf.get_variable(dtype=tf.float32,
                                   name='mean',
                                   initializer=self.mean.astype(np.float32),
                                   trainable=False)
    self.std_tf = tf.get_variable(dtype=tf.float32,
                                  name='std',
                                  initializer=self.std.astype(np.float32),
                                  trainable=False)

    self.count_ph = tf.get_variable(dtype=tf.int32, name='count_ph', shape=[1])
    self.mean_ph = tf.get_variable(dtype=tf.float32, name='mean_ph', shape=self.mean.shape)
    self.std_ph = tf.get_variable(dtype=tf.float32, name='std_ph', shape=self.std.shape)

    self._update_op = tf.group(self.count_tf.assign(self.count_ph),
                               self.mean_tf.assign(self.mean_ph), self.std_tf.assign(self.std_ph))
    return

  def _update_resource_tf(self):
    feed = {
        self.count_ph: np.array([self.count], dtype=np.int32),
        self.mean_ph: self.mean,
        self.std_ph: self.std
    }
    self.sess.run(self._update_op, feed_dict=feed)
    return
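
A numpy sketch of the normalize/unnormalize pair mirrored by normalize_tf and unnormalize_tf above (illustrative values; the TF versions do the same arithmetic on tensors and clip to +/- self.clip):

import numpy as np

mean = np.array([0.0, 1.0])
std = np.array([2.0, 0.5])
clip = 5.0

x = np.array([4.0, -1.0])
norm_x = np.clip((x - mean) / std, -clip, clip)  # -> [ 2., -4.]
restored = norm_x * std + mean                   # -> [ 4., -1.]
assert np.allclose(restored, x)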

View File

@@ -4,101 +4,116 @@ import os
xavier_initializer = tf.contrib.layers.xavier_initializer()
def disable_gpu():
  os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
  return


def var_shape(x):
  out = [k.value for k in x.get_shape()]
  assert all(isinstance(a, int) for a in out), "shape function assumes that shape is fully known"
  return out


def intprod(x):
  return int(np.prod(x))


def numel(x):
  n = intprod(var_shape(x))
  return n


def flat_grad(loss, var_list, grad_ys=None):
  grads = tf.gradients(loss, var_list, grad_ys)
  return tf.concat([tf.reshape(grad, [numel(v)]) for (v, grad) in zip(var_list, grads)], axis=0)


def fc_net(input, layers_sizes, activation, reuse=None,
           flatten=False):  # build fully connected network
  curr_tf = input
  for i, size in enumerate(layers_sizes):
    with tf.variable_scope(str(i), reuse=reuse):
      curr_tf = tf.layers.dense(inputs=curr_tf,
                                units=size,
                                kernel_initializer=xavier_initializer,
                                activation=activation if i < len(layers_sizes) - 1 else None)
  if flatten:
    assert layers_sizes[-1] == 1
    curr_tf = tf.reshape(curr_tf, [-1])

  return curr_tf


def copy(sess, src, dst):
  assert len(src) == len(dst)
  sess.run(list(map(lambda v: v[1].assign(v[0]), zip(src, dst))))
  return


def flat_grad(loss, var_list):
  grads = tf.gradients(loss, var_list)
  return tf.concat(axis=0,
                   values=[tf.reshape(grad, [numel(v)]) for (v, grad) in zip(var_list, grads)])


def calc_logp_gaussian(x_tf, mean_tf, std_tf):
  dim = tf.to_float(tf.shape(x_tf)[-1])

  if mean_tf is None:
    diff_tf = x_tf
  else:
    diff_tf = x_tf - mean_tf

  logp_tf = -0.5 * tf.reduce_sum(tf.square(diff_tf / std_tf), axis=-1)
  logp_tf += -0.5 * dim * np.log(2 * np.pi) - tf.reduce_sum(tf.log(std_tf), axis=-1)

  return logp_tf


def calc_bound_loss(x_tf, bound_min, bound_max):
  # penalty for violating bounds
  violation_min = tf.minimum(x_tf - bound_min, 0)
  violation_max = tf.maximum(x_tf - bound_max, 0)
  violation = tf.reduce_sum(tf.square(violation_min), axis=-1) + tf.reduce_sum(
      tf.square(violation_max), axis=-1)
  loss = 0.5 * tf.reduce_mean(violation)
  return loss


class SetFromFlat(object):

  def __init__(self, sess, var_list, dtype=tf.float32):
    assigns = []
    shapes = list(map(var_shape, var_list))
    total_size = np.sum([intprod(shape) for shape in shapes])

    self.sess = sess
    self.theta = tf.placeholder(dtype, [total_size])
    start = 0
    assigns = []

    for (shape, v) in zip(shapes, var_list):
      size = intprod(shape)
      assigns.append(tf.assign(v, tf.reshape(self.theta[start:start + size], shape)))
      start += size

    self.op = tf.group(*assigns)

    return

  def __call__(self, theta):
    self.sess.run(self.op, feed_dict={self.theta: theta})
    return


class GetFlat(object):

  def __init__(self, sess, var_list):
    self.sess = sess
    self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list])
    return

  def __call__(self):
    return self.sess.run(self.op)
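
calc_logp_gaussian above is the log-density of a diagonal Gaussian. A numpy-only check of the same formula against a per-dimension computation, with illustrative values:

import numpy as np

x = np.array([0.3, -1.2, 0.8])
mean = np.array([0.0, -1.0, 1.0])
std = np.array([0.5, 1.0, 2.0])

dim = x.shape[-1]
logp = -0.5 * np.sum(np.square((x - mean) / std), axis=-1) \
       - 0.5 * dim * np.log(2 * np.pi) - np.sum(np.log(std), axis=-1)

# the same quantity summed dimension by dimension
per_dim = -0.5 * ((x - mean) / std)**2 - 0.5 * np.log(2 * np.pi) - np.log(std)
assert np.isclose(logp, per_dim.sum())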