Make sure that the pre-trained galloping and trotting policies work with the minitaur_reactive_env and minitaur_trotting_env environments.

This commit is contained in:
Jie
2018-04-24 21:48:27 -07:00
parent 982453abc6
commit a375a349ce
43 changed files with 4447 additions and 42 deletions

View File

@@ -0,0 +1,11 @@
This folder contains a number of simulated Minitaur environments implemented using pybullet.
The following two environments are used in the RSS paper "Sim-to-Real: Learning Agile Locomotion For Quadruped Robots":
1) Galloping example: minitaur_reactive_env.py
python minitaur_reactive_env_example.py runs a pre-trained PPO agent that performs a galloping gait.
2) Trotting example: minitaur_trotting_env.py
python minitaur_trotting_env_example.py runs a pre-trained PPO agent that performs a trotting gait.
The rest are experimental environments.

View File

@@ -216,7 +216,7 @@ class MinitaurFourLegStandEnv(minitaur_gym_env.MinitaurGymEnv):
self._env_step_counter += 1
done = self._termination()
obs = self._get_true_observation()
reward = self._reward(action, obs)
reward = self._reward()
if self._log_path is not None:
minitaur_logging.update_episode_proto(self._episode_proto, self.minitaur,
action, self._env_step_counter)
@@ -272,7 +272,7 @@ class MinitaurFourLegStandEnv(minitaur_gym_env.MinitaurGymEnv):
np.asarray([0, 0, 1]), np.asarray(local_up))
return local_global_up_dot_product < 0.85 or height < 0.15
def _reward(self, action, obs):
def _reward(self):
roll, pitch, _ = self.minitaur.GetBaseRollPitchYaw()
return 1.0 / (0.001 + math.fabs(roll) + math.fabs(pitch))

View File

@@ -1,8 +1,4 @@
r"""An example to use simple_ppo_agent.
blaze run -c opt \
//robotics/reinforcement_learning/minitaur/envs:minitaur_reactive_env_example
"""
r"""Running a pre-trained ppo agent on minitaur_reactive_env."""
from __future__ import absolute_import
from __future__ import division
@@ -11,13 +7,13 @@ from __future__ import print_function
import os
import time
import tensorflow as tf
from agents.scripts import utility
from pybullet_envs.minitaur.agents.scripts import utility
import pybullet_data
import simple_ppo_agent
flags = tf.app.flags
FLAGS = tf.app.flags.FLAGS
LOG_DIR = (
"testdata/minitaur_reactive_env_test")
LOG_DIR = os.path.join(pybullet_data.getDataPath(), "policies/ppo/minitaur_reactive_env")
CHECKPOINT = "model.ckpt-14000000"
@@ -43,7 +39,6 @@ def main(argv):
while True:
action = agent.get_action([observation])
observation, reward, done, _ = env.step(action[0])
# This sleep is to prevent serial communication error on the real robot.
time.sleep(0.002)
sum_reward += reward
if done:

View File

@@ -1,41 +1,51 @@
"""An example to run the minitaur environment of trotting gait.
r"""Running a pre-trained ppo agent on minitaur_trotting_env."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""
import time
import os
import numpy as np
import time
import tensorflow as tf
from pybullet_envs.minitaur.envs import minitaur_gym_env
from pybullet_envs.minitaur.envs import minitaur_trotting_env
from pybullet_envs.minitaur.agents.scripts import utility
import pybullet_data
import simple_ppo_agent
#FLAGS = tf.flags.FLAGS
#tf.flags.DEFINE_string("log_path", None, "The directory to write the log file.")
flags = tf.app.flags
FLAGS = tf.app.flags.FLAGS
LOG_DIR = os.path.join(pybullet_data.getDataPath(), "policies/ppo/minitaur_trotting_env")
CHECKPOINT = "model.ckpt-14000000"
def main(unused_argv):
environment = minitaur_trotting_env.MinitaurTrottingEnv(
urdf_version=minitaur_gym_env.RAINBOW_DASH_V0_URDF_VERSION,
use_signal_in_observation=False,
use_angle_in_observation=False,
render=True,
log_path=os.getcwd())
def main(argv):
del argv # Unused.
config = utility.load_config(LOG_DIR)
policy_layers = config.policy_layers
value_layers = config.value_layers
env = config.env(render=True)
network = config.network
np.random.seed(100)
sum_reward = 0
environment.reset()
with tf.Session() as sess:
agent = simple_ppo_agent.SimplePPOPolicy(
sess,
env,
network,
policy_layers=policy_layers,
value_layers=value_layers,
checkpoint=os.path.join(LOG_DIR, CHECKPOINT))
steps = 5000
for _ in range(steps):
# Sleep to prevent serial buffer overflow on microcontroller.
time.sleep(0.002)
action = [0] * 8
_, reward, done, _ = environment.step(action)
sum_reward += reward
if done:
break
tf.logging.info("reward: {}".format(sum_reward))
sum_reward = 0
observation = env.reset()
while True:
action = agent.get_action([observation])
observation, reward, done, _ = env.step(action[0])
time.sleep(0.002)
sum_reward += reward
if done:
break
tf.logging.info("reward: %s", sum_reward)
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run()
tf.app.run(main)