finish exercise 2.5
New binary file: exercise_2-5.png (59 KiB, not shown).
exercise_2-5.py:

@@ -2,6 +2,8 @@ import numpy as np
 import random
 import matplotlib
 import matplotlib.pyplot as plt
+from tqdm import tqdm
+from mpl_toolkits.axes_grid1.axes_divider import make_axes_area_auto_adjustable
 
 class KArmedBandit:
     def __init__(self,
@@ -45,7 +47,13 @@ class KArmedBandit:
         if step >= self.steps or step < 0:
             raise ValueError("Step {} out of bounds. Current steps: {}".format(step, self.steps))
 
-        return self.rewards[action][step]
+        if self.random_walk:
+            # Slightly move all the averages
+            moves = np.random.normal(loc=0.0, scale=0.01, size=self.arms)
+            self.average_rewards = np.array(self.average_rewards) + moves
+
+        return np.random.normal(loc=self.average_rewards[action], scale=1, size=1)
+        # return self.rewards[action][step]
 
     def get_max_reward_action(self):
         return (max(self.average_rewards), np.argmax(self.average_rewards))
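This hunk turns get_reward into the nonstationary testbed that Exercise 2.5 of Sutton and Barto describes: the true action values take independent random walks (every call adds a N(0, 0.01) increment to all of them), and the observed reward is a unit-variance normal draw around the current true value. A minimal standalone sketch of the same mechanism, under hypothetical names (q_star, get_reward, rng) rather than the module's actual structure:

    import numpy as np

    rng = np.random.default_rng(0)
    arms = 10
    q_star = np.zeros(arms)  # true action values start out equal

    def get_reward(action):
        # random walk: nudge every true value by a N(0, 0.01) increment
        q_star[:] += rng.normal(loc=0.0, scale=0.01, size=arms)
        # observed reward: unit-variance normal around the moving true value
        return rng.normal(loc=q_star[action], scale=1.0)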
@@ -82,10 +90,15 @@ class Average:
         self.count += 1
 
 class Plotter:
-    def __init__(self, karmedbandit: KArmedBandit, epsilongreedy: EpsilonGreedy, steps: int):
+    def __init__(self,
+                 karmedbandit: KArmedBandit,
+                 epsilongreedy: EpsilonGreedy,
+                 steps: int,
+                 alpha: bool = True):
         self.karmedbandit = karmedbandit
         self.agent = epsilongreedy
         self.steps = steps
+        self.alpha = alpha
         self.reset()
 
     def reset(self):
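Note that the new alpha parameter is a boolean selecting the update rule, not the step-size value itself (the 0.1 constant lives in run() below), so the name is slightly misleading. Usage, reusing the objects from the __main__ block:

    plotter = Plotter(armedbandit, greedy, steps, alpha=True)   # constant step size 0.1
    plotter = Plotter(armedbandit, greedy, steps, alpha=False)  # sample-average (1/n) updates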
@@ -111,8 +124,8 @@ class Plotter:
             self.optimal_action_fraction[step].update(self.optimal_action_counter / (step+1))
 
     def run(self, runs=1):
-        for i in range(runs):
-            self.karmedbandit.generate_rewards()
+        for i in tqdm(range(runs), desc="Runs"):
+            self.karmedbandit.generate_average_rewards()
             # optimal_action = self.karmedbandit.get_max_reward_action()[1]
             self.agent.reset()
             self.actions_counter = [0] * self.karmedbandit.arms
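tqdm simply wraps the iterable and renders a progress bar; the loop body is untouched. The pattern in isolation:

    from tqdm import tqdm

    for i in tqdm(range(1000), desc="Runs"):
        ...  # one independent bandit run per iteration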
@@ -121,8 +134,11 @@ class Plotter:
                 optimal_action = self.karmedbandit.get_max_reward_action()[1]
                 action = self.agent.choose_action()
                 reward = self.karmedbandit.get_reward(action, step)
-                # self.agent.update(action, reward, 1/(self.actions_counter[action]+1))
-                self.agent.update(action, reward, 0.1)
+
+                if self.alpha:
+                    self.agent.update(action, reward, 0.1)
+                else:
+                    self.agent.update(action, reward, 1/(self.actions_counter[action]+1))
                 self.average_rewards[step].update(reward)
                 self.count_action(action)
                 if action == optimal_action:
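This is the comparison the exercise is after. Both branches are instances of the incremental update from the notes, Q_{n+1} = Q_n + alpha_n [R_n - Q_n]: a constant alpha keeps tracking the drifting action values, while the sample-average choice alpha_n = 1/n weights all history equally and gradually stops adapting. A self-contained sketch of the two rules over a toy reward stream (update is a hypothetical helper, not the agent's actual method):

    def update(q, reward, step_size):
        # Q_{n+1} = Q_n + alpha_n * (R_n - Q_n)
        return q + step_size * (reward - q)

    q_const, q_avg, n = 0.0, 0.0, 0
    for r in [1.0, 0.5, 2.0, 1.5]:          # toy reward stream
        n += 1
        q_const = update(q_const, r, 0.1)   # constant step size
        q_avg = update(q_avg, r, 1 / n)     # sample average, alpha_n = 1/n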
@@ -138,27 +154,38 @@ class Plotter:
         violin_data = self.karmedbandit.sample_rewards(steps=10000)
         bp = ax1.violinplot(violin_data, showmeans=True, showmedians=False,
                             showextrema=False)
-        ax1.set_ylim([-4,4])
+        ax1.set_ylabel("Reward distribution", rotation='vertical')
+        # ax1.set_xlabel("Actions", rotation='horizontal')
 
         ax2.plot(range(self.steps), [x.value for x in self.average_rewards])
 
-        ylabel = "Average reward. Max average reward: {}".format(self.karmedbandit.get_max_reward_action()[0])
+        # ylabel = "Average reward. Max average reward: {}".format(self.karmedbandit.get_max_reward_action()[0])
+        ylabel = "Average reward."
         ax2.set(ylabel=ylabel, title='')
+        ax2.set_ylabel(ylabel, rotation='vertical')
+        # ax2.set_xlabel("Steps", rotation='horizontal')
 
         ax3.plot(range(self.steps), [x.value for x in self.optimal_action_fraction])
         ax3.set_ylim([0,1])
+        ax3.set_ylabel("Optimal action", rotation='vertical')
+        # ax3.set_xlabel("Steps", rotation='horizontal')
 
         plt.xlabel("Steps")
+        fig.set_figheight(8)
+        fig.set_figwidth(8)
+        # make_axes_area_auto_adjustable(ax1)
+        # make_axes_area_auto_adjustable(ax2)
+        # make_axes_area_auto_adjustable(ax3)
         fig.savefig("exercise_2-5.png")
         plt.show()
 
 
 if __name__ == "__main__":
     arms = 10
-    steps = 1000
-    armedbandit = KArmedBandit(arms=arms, steps=steps, random_walk=False)
+    steps = 10000
+    armedbandit = KArmedBandit(arms=arms, steps=steps, random_walk=True)
     greedy = EpsilonGreedy(0.1, arms)
 
     plotter = Plotter(armedbandit, greedy, steps)
-    plotter.run(runs=100)
+    plotter.run(runs=1000)
     plotter.plot()
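The __main__ block now runs the nonstationary variant (random_walk=True) over the longer 10000-step horizon, averaged across 1000 runs, but it only exercises the default constant-alpha agent. To produce the side-by-side comparison the exercise asks for, one would presumably loop over both settings of the new flag, along these lines:

    for use_alpha in (True, False):
        armedbandit = KArmedBandit(arms=10, steps=10000, random_walk=True)
        greedy = EpsilonGreedy(0.1, 10)
        plotter = Plotter(armedbandit, greedy, 10000, alpha=use_alpha)
        plotter.run(runs=1000)
        plotter.plot()

As written, plot() saves to a fixed exercise_2-5.png, so the second pass would overwrite the first figure; the filename would need to encode the setting.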
And in the accompanying Markdown notes:

@@ -142,4 +142,5 @@ Q_{n+1} = & \; Q_n + \alpha_n \left[R_n - Q_n\right] \\
 
 # Exercise 2.5
 See [exercise_2-5.py](./exercise_2-5.py) for code.
+![](exercise_2-5.png)
 