From f6ea2a73796438734be1a0cb5f8124da8e976be8 Mon Sep 17 00:00:00 2001
From: Erwin Coumans
Date: Thu, 1 Nov 2018 10:20:54 -0700
Subject: [PATCH 1/2] fix a potential data race condition. remove
 createObstacleCourse programmatic creation part until we can stream
 vertices/indices.

---
 .../SharedMemory/PhysicsServerExample.cpp     |  3 ++-
 .../pybullet/examples/createObstacleCourse.py | 22 -------------------
 2 files changed, 2 insertions(+), 23 deletions(-)

diff --git a/examples/SharedMemory/PhysicsServerExample.cpp b/examples/SharedMemory/PhysicsServerExample.cpp
index 2dc7145be..36ec1f18b 100644
--- a/examples/SharedMemory/PhysicsServerExample.cpp
+++ b/examples/SharedMemory/PhysicsServerExample.cpp
@@ -1764,8 +1764,9 @@ void PhysicsServerExample::initPhysics()
 		}
 	}
 
+	m_args[0].m_cs->lock();
 	m_args[0].m_cs->setSharedParam(1, eGUIHelperIdle);
-
+	m_args[0].m_cs->unlock();
 	m_args[0].m_cs2->lock();
 
 	{
diff --git a/examples/pybullet/examples/createObstacleCourse.py b/examples/pybullet/examples/createObstacleCourse.py
index 1194a213a..ef145c72c 100644
--- a/examples/pybullet/examples/createObstacleCourse.py
+++ b/examples/pybullet/examples/createObstacleCourse.py
@@ -21,28 +21,6 @@ colSphereId = p.createCollisionShape(p.GEOM_SPHERE,radius=sphereRadius)
 
 #convex mesh from obj
 stoneId = p.createCollisionShape(p.GEOM_MESH,fileName="stone.obj")
-#concave mesh from obj
-stoneId = p.createCollisionShape(p.GEOM_MESH,fileName="stone.obj", flags=p.GEOM_FORCE_CONCAVE_TRIMESH)
-
-
-verts=[[-0.246350, -0.246483, -0.000624],
-       [ -0.151407, -0.176325, 0.172867],
-       [ -0.246350, 0.249205, -0.000624],
-       [ -0.151407, 0.129477, 0.172867],
-       [ 0.249338, -0.246483, -0.000624],
-       [ 0.154395, -0.176325, 0.172867],
-       [ 0.249338, 0.249205, -0.000624],
-       [ 0.154395, 0.129477, 0.172867]]
-#convex mesh from vertices
-stoneConvexId = p.createCollisionShape(p.GEOM_MESH,vertices=verts)
-
-indices=[0,3,2,3,6,2,7,4,6,5,0,4,6,0,2,3,5,7,0,1,3,3,7,6,7,5,4,5,1,0,6,4,0,3,1,5]
-
-#concave mesh from vertices+indices
-stoneConcaveId = p.createCollisionShape(p.GEOM_MESH,vertices=verts, indices=indices)
-
-stoneId = stoneConvexId
-#stoneId = stoneConcaveId
 
 
 boxHalfLength = 0.5

From 8b53e47fe8531710eb1ae0533035ef4e51f855d4 Mon Sep 17 00:00:00 2001
From: Erwin Coumans
Date: Fri, 2 Nov 2018 11:19:46 -0700
Subject: [PATCH 2/2] add simpler ARS implementation, thanks to Alexis Jacq
 and Hadelin de Ponteves (will add save/restore of policy and rendering
 movies through command-line arguments soon)

---
 .../pybullet/gym/pybullet_envs/ARS/ars.py     | 142 ++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 examples/pybullet/gym/pybullet_envs/ARS/ars.py

diff --git a/examples/pybullet/gym/pybullet_envs/ARS/ars.py b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
new file mode 100644
index 000000000..b4c0c3127
--- /dev/null
+++ b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
@@ -0,0 +1,142 @@
+# AI 2018
+
+# Importing the libraries
+import os
+import numpy as np
+import gym
+from gym import wrappers
+import pybullet_envs
+
+# Setting the Hyper Parameters
+
+class Hp():
+
+    def __init__(self):
+        self.nb_steps = 1000
+        self.episode_length = 1000
+        self.learning_rate = 0.02
+        self.nb_directions = 16
+        self.nb_best_directions = 16
+        assert self.nb_best_directions <= self.nb_directions
+        self.noise = 0.03
+        self.seed = 1
+        self.env_name = 'HalfCheetahBulletEnv-v0'
+
+# Normalizing the states
+
+class Normalizer():
+
+    def __init__(self, nb_inputs):
+        self.n = np.zeros(nb_inputs)
+        self.mean = np.zeros(nb_inputs)
+        self.mean_diff = np.zeros(nb_inputs)
+        self.var = np.zeros(nb_inputs)
+
+    def observe(self, x):
+        self.n += 1.
+        last_mean = self.mean.copy()
+        self.mean += (x - self.mean) / self.n
+        self.mean_diff += (x - last_mean) * (x - self.mean)
+        self.var = (self.mean_diff / self.n).clip(min = 1e-2)
+
+    def normalize(self, inputs):
+        obs_mean = self.mean
+        obs_std = np.sqrt(self.var)
+        return (inputs - obs_mean) / obs_std
+
+# Building the AI
+
+class Policy():
+
+    def __init__(self, input_size, output_size):
+        self.theta = np.zeros((output_size, input_size))
+        print("self.theta=",self.theta)
+    def evaluate(self, input, delta = None, direction = None):
+        if direction is None:
+            return np.clip(self.theta.dot(input), -1.0, 1.0)
+        elif direction == "positive":
+            return np.clip((self.theta + hp.noise*delta).dot(input), -1.0, 1.0)
+        else:
+            return np.clip((self.theta - hp.noise*delta).dot(input), -1.0, 1.0)
+
+    def sample_deltas(self):
+        return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]
+
+    def update(self, rollouts, sigma_r):
+        step = np.zeros(self.theta.shape)
+        for r_pos, r_neg, d in rollouts:
+            step += (r_pos - r_neg) * d
+        self.theta += hp.learning_rate / (hp.nb_best_directions * sigma_r) * step
+
+# Exploring the policy on one specific direction and over one episode
+
+def explore(env, normalizer, policy, direction = None, delta = None):
+    state = env.reset()
+    done = False
+    num_plays = 0.
+    sum_rewards = 0
+    while not done and num_plays < hp.episode_length:
+        normalizer.observe(state)
+        state = normalizer.normalize(state)
+        action = policy.evaluate(state, delta, direction)
+        state, reward, done, _ = env.step(action)
+        reward = max(min(reward, 1), -1)
+        sum_rewards += reward
+        num_plays += 1
+    return sum_rewards
+
+# Training the AI
+
+def train(env, policy, normalizer, hp):
+
+    for step in range(hp.nb_steps):
+
+        # Initializing the perturbations deltas and the positive/negative rewards
+        deltas = policy.sample_deltas()
+        positive_rewards = [0] * hp.nb_directions
+        negative_rewards = [0] * hp.nb_directions
+
+        # Getting the positive rewards in the positive directions
+        for k in range(hp.nb_directions):
+            positive_rewards[k] = explore(env, normalizer, policy, direction = "positive", delta = deltas[k])
+
+        # Getting the negative rewards in the negative/opposite directions
+        for k in range(hp.nb_directions):
+            negative_rewards[k] = explore(env, normalizer, policy, direction = "negative", delta = deltas[k])
+
+        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
+        all_rewards = np.array(positive_rewards + negative_rewards)
+        sigma_r = all_rewards.std()
+
+        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
+        scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
+        order = sorted(scores.keys(), key = lambda x:scores[x])[:hp.nb_best_directions]
+        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
+
+        # Updating our policy
+        policy.update(rollouts, sigma_r)
+
+        # Printing the final reward of the policy after the update
+        reward_evaluation = explore(env, normalizer, policy)
+        print('Step:', step, 'Reward:', reward_evaluation)
+
+# Running the main code
+
+def mkdir(base, name):
+    path = os.path.join(base, name)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    return path
+work_dir = mkdir('exp', 'brs')
+monitor_dir = mkdir(work_dir, 'monitor')
+
+hp = Hp()
+np.random.seed(hp.seed)
+env = gym.make(hp.env_name)
+# env.render(mode = "human")
+#env = wrappers.Monitor(env, monitor_dir, force = True)
+nb_inputs = env.observation_space.shape[0]
+nb_outputs = env.action_space.shape[0]
+policy = Policy(nb_inputs, nb_outputs)
+normalizer = Normalizer(nb_inputs)
+train(env, policy, normalizer, hp)
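
Note (not part of either patch above): the second commit message mentions save/restore of the policy as upcoming work. Since Policy only holds the numpy matrix theta and Normalizer only holds running statistics, one minimal sketch is plain numpy serialization. The file name ars_policy.npz and the helpers save_policy/load_policy below are illustrative assumptions, not an existing API in this repository.

import numpy as np

def save_policy(policy, normalizer, path='ars_policy.npz'):
    # Policy.theta plus the Normalizer's running statistics fully describe the agent.
    np.savez(path,
             theta=policy.theta,
             n=normalizer.n,
             mean=normalizer.mean,
             mean_diff=normalizer.mean_diff,
             var=normalizer.var)

def load_policy(policy, normalizer, path='ars_policy.npz'):
    # Restore the weights and normalization state written by save_policy.
    data = np.load(path)
    policy.theta = data['theta']
    normalizer.n = data['n']
    normalizer.mean = data['mean']
    normalizer.mean_diff = data['mean_diff']
    normalizer.var = data['var']

Calling save_policy(policy, normalizer) after train(...) and load_policy(policy, normalizer) before an evaluation run would be one way to wire this in once the command-line arguments mentioned in the commit message are added.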