added the learning algorithm from RL-lab
@@ -24,13 +24,8 @@ class CartPoleBulletEnv(gym.Env):
     def __init__(self):
         # start the bullet physics server
-        # cmdStartBulletServer=['/Users/jietan/Projects/bullet3/build_cmake_python3/examples/SharedMemory/App_SharedMemoryPhysics_GUI']
-        # subprocess.Popen(cmdStartBulletServer)
-        # wait to make sure that the physics server is ready
-        # time.sleep(1)
-        # connect to the physics server
-        # p.connect(p.SHARED_MEMORY)
         p.connect(p.GUI)
+        # p.connect(p.DIRECT)
 
         observation_high = np.array([
             np.finfo(np.float32).max,
             np.finfo(np.float32).max,
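For reference, the connection mode chosen above controls whether pybullet opens a visualizer. A minimal sketch (not part of this commit) of switching between the two modes that appear in the hunk:

```python
# Sketch, not part of this commit: choosing a pybullet connection mode.
# p.GUI opens the OpenGL visualizer window; p.DIRECT runs headless,
# which is usually preferable for long training runs.
import pybullet as p

USE_GUI = False  # flip to True to watch the simulation
client = p.connect(p.GUI if USE_GUI else p.DIRECT)
```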
@@ -1,27 +0,0 @@
-import gym
-import numpy as np
-import math
-
-from envs.bullet.minitaur_bullet import MinitaurBulletEnv
-
-def main():
-    environment = gym.make('MinitaurBulletEnv-v0')
-    sum_reward = 0
-    steps = 1000
-    amplitude = 0.5
-    speed = 0.3
-
-    for stepCounter in range(steps):
-        a1 = math.sin(stepCounter*speed)*amplitude
-        a2 = math.sin(stepCounter*speed+3.14)*amplitude
-        action = [a1, 0, a2, 0, 0, a1, 0, a2]
-        state, reward, done, info = environment.step(action)
-        sum_reward += reward
-        print(state)
-        if done:
-            environment.reset()
-    average_reward = sum_reward / steps
-    print("avg reward: ", average_reward)
-
-
-main()
examples/pybullet/gym/trpo_cartpole_bullet_gym.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+from envs.bullet.cartpole_bullet import CartPoleBulletEnv
+from rllab.algos.trpo import TRPO
+from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
+from rllab.envs.gym_env import GymEnv
+from rllab.envs.normalized_env import normalize
+from rllab.misc.instrument import stub, run_experiment_lite
+from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
+import subprocess
+import time
+
+stub(globals())
+
+env = normalize(GymEnv("CartPoleBulletEnv-v0"))
+
+policy = GaussianMLPPolicy(
+    env_spec=env.spec,
+    # The policy network has a single hidden layer with 8 hidden units.
+    hidden_sizes=(8,)
+)
+
+baseline = LinearFeatureBaseline(env_spec=env.spec)
+
+algo = TRPO(
+    env=env,
+    policy=policy,
+    baseline=baseline,
+    batch_size=5000,
+    max_path_length=env.horizon,
+    n_itr=50,
+    discount=0.999,
+    step_size=0.01,
+    # Uncomment both lines (this and the plot parameter below) to enable plotting
+    # plot=True,
+)
+
+# cmdStartBulletServer=['~/Projects/rllab/bullet_examples/run_physics_server.sh']
+# subprocess.Popen(cmdStartBulletServer, shell=True)
+# time.sleep(1)
+
+
+run_experiment_lite(
+    algo.train(),
+    # Number of parallel workers for sampling
+    n_parallel=1,
+    # Only keep the snapshot parameters for the last iteration
+    snapshot_mode="last",
+    # Specifies the seed for the experiment. If this is not provided, a random seed
+    # will be used
+    seed=1,
+    # plot=True,
+)
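This script assumes that the "CartPoleBulletEnv-v0" id resolves through Gym's registry (rllab's GymEnv wraps gym.make). A minimal sanity-check sketch in the style of the deleted minitaur test, using the four-value Gym step API of this era; it is an illustration under that registration assumption, not part of the commit:

```python
# Sketch, not part of this commit: a random-rollout sanity check for the
# Bullet cart-pole environment, assuming 'CartPoleBulletEnv-v0' is registered.
import gym
from envs.bullet.cartpole_bullet import CartPoleBulletEnv  # make the env module importable

env = gym.make('CartPoleBulletEnv-v0')
state = env.reset()
sum_reward = 0
steps = 1000
for _ in range(steps):
    action = env.action_space.sample()          # random action in the env's action space
    state, reward, done, info = env.step(action)
    sum_reward += reward
    if done:
        state = env.reset()
print("avg reward:", sum_reward / steps)
```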
examples/pybullet/gym/trpo_tf_cartpole_bullet_gym.py (new file, 48 lines)
@@ -0,0 +1,48 @@
+from envs.bullet.cartpole_bullet import CartPoleBulletEnv
+from sandbox.rocky.tf.algos.trpo import TRPO
+from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
+from sandbox.rocky.tf.envs.base import TfEnv
+
+from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
+from rllab.envs.gym_env import GymEnv
+from rllab.envs.normalized_env import normalize
+from rllab.misc.instrument import stub, run_experiment_lite
+
+stub(globals())
+
+env = TfEnv(normalize(GymEnv("CartPoleBulletEnv-v0")))
+
+policy = GaussianMLPPolicy(
+    name="tf_gaussian_mlp",
+    env_spec=env.spec,
+    # The policy network has a single hidden layer with 8 hidden units.
+    hidden_sizes=(8,)
+)
+
+baseline = LinearFeatureBaseline(env_spec=env.spec)
+
+algo = TRPO(
+    env=env,
+    policy=policy,
+    baseline=baseline,
+    batch_size=5000,
+    max_path_length=env.horizon,
+    n_itr=50,
+    discount=0.999,
+    step_size=0.01,
+    force_batch_sampler=True,
+    # Uncomment both lines (this and the plot parameter below) to enable plotting
+    # plot=True,
+)
+
+run_experiment_lite(
+    algo.train(),
+    # Number of parallel workers for sampling
+    n_parallel=1,
+    # Only keep the snapshot parameters for the last iteration
+    snapshot_mode="last",
+    # Specifies the seed for the experiment. If this is not provided, a random seed
+    # will be used
+    seed=1,
+    # plot=True,
+)
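Both new scripts rely on "CartPoleBulletEnv-v0" already being registered with Gym before GymEnv constructs it; that registration happens outside this diff. A hedged sketch of what it might look like, where the entry point and any episode-length limit are assumptions, not taken from this commit:

```python
# Sketch, not part of this commit: registering the Bullet cart-pole with Gym
# so that gym.make / rllab's GymEnv can resolve 'CartPoleBulletEnv-v0'.
# An episode-length limit can also be passed here (timestep_limit in older
# gym releases, max_episode_steps in newer ones).
from gym.envs.registration import register

register(
    id='CartPoleBulletEnv-v0',
    entry_point='envs.bullet.cartpole_bullet:CartPoleBulletEnv',  # assumed module path
)
```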