diff --git a/exercise_2-5.png b/exercise_2-5.png
new file mode 100644
index 0000000..7df47f3
Binary files /dev/null and b/exercise_2-5.png differ
diff --git a/exercise_2-5.py b/exercise_2_5.py
similarity index 76%
rename from exercise_2-5.py
rename to exercise_2_5.py
index d1f2ace..3adee8a 100644
--- a/exercise_2-5.py
+++ b/exercise_2_5.py
@@ -2,6 +2,8 @@ import numpy as np
 import random
 import matplotlib
 import matplotlib.pyplot as plt
+from tqdm import tqdm
+from mpl_toolkits.axes_grid1.axes_divider import make_axes_area_auto_adjustable
 
 class KArmedBandit:
     def __init__(self,
@@ -45,7 +47,13 @@ class KArmedBandit:
         if step >= self.steps or step < 0:
             ValueError("Step {} out of bounds. Current steps: {}".format(step, self.steps))
 
-        return self.rewards[action][step]
+        if self.random_walk:
+            # Slightly move all the averages
+            moves = np.random.normal(loc=0.0, scale=0.01, size=self.arms)
+            self.average_rewards = np.array(self.average_rewards) + moves
+
+        return np.random.normal(loc=self.average_rewards[action], scale=1, size=1)
+        # return self.rewards[action][step]
 
     def get_max_reward_action(self):
         return (max(self.average_rewards), np.argmax(self.average_rewards))
@@ -82,10 +90,15 @@ class Average:
         self.count += 1
 
 class Plotter:
-    def __init__(self, karmedbandit: KArmedBandit, epsilongreedy: EpsilonGreedy, steps: int):
+    def __init__(self,
+                 karmedbandit: KArmedBandit,
+                 epsilongreedy: EpsilonGreedy,
+                 steps: int,
+                 alpha: bool = True):
         self.karmedbandit = karmedbandit
         self.agent = epsilongreedy
         self.steps = steps
+        self.alpha = alpha
         self.reset()
 
     def reset(self):
@@ -111,8 +124,8 @@ class Plotter:
         self.optimal_action_fraction[step].update(self.optimal_action_counter / (step+1))
 
     def run(self, runs=1):
-        for i in range(runs):
-            self.karmedbandit.generate_rewards()
+        for i in tqdm(range(runs), desc="Runs"):
+            self.karmedbandit.generate_average_rewards()
             # optimal_action = self.karmedbandit.get_max_reward_action()[1]
             self.agent.reset()
             self.actions_counter = [0] * self.karmedbandit.arms
@@ -121,8 +134,11 @@ class Plotter:
                 optimal_action = self.karmedbandit.get_max_reward_action()[1]
                 action = self.agent.choose_action()
                 reward = self.karmedbandit.get_reward(action, step)
-                # self.agent.update(action, reward, 1/(self.actions_counter[action]+1))
-                self.agent.update(action, reward, 0.1)
+
+                if self.alpha:
+                    self.agent.update(action, reward, 0.1)
+                else:
+                    self.agent.update(action, reward, 1/(self.actions_counter[action]+1))
                 self.average_rewards[step].update(reward)
                 self.count_action(action)
                 if action == optimal_action:
@@ -138,27 +154,38 @@ class Plotter:
         violin_data = self.karmedbandit.sample_rewards(steps=10000)
 
         bp = ax1.violinplot(violin_data, showmeans=True, showmedians=False, showextrema=False)
-        ax1.set_ylim([-4,4])
+        ax1.set_ylabel("Reward distribution", rotation='vertical')
+        # ax1.set_xlabel("Actions", rotation='horizontal')
 
         ax2.plot(range(self.steps), [x.value for x in self.average_rewards])
-        ylabel = "Average reward. Max average reward: {}".format(self.karmedbandit.get_max_reward_action()[0])
+        # ylabel = "Average reward. Max average reward: {}".format(self.karmedbandit.get_max_reward_action()[0])
+        ylabel = "Average reward."
         ax2.set(ylabel=ylabel, title='')
+        ax2.set_ylabel(ylabel, rotation='vertical')
+        # ax2.set_xlabel("Steps", rotation='horizontal')
 
         ax3.plot(range(self.steps), [x.value for x in self.optimal_action_fraction])
         ax3.set_ylim([0,1])
+        ax3.set_ylabel("Optimal action", rotation='vertical')
+        # ax3.set_xlabel("Steps", rotation='horizontal')
 
         plt.xlabel("Steps")
 
+        fig.set_figheight(8)
+        fig.set_figwidth(8)
+        # make_axes_area_auto_adjustable(ax1)
+        # make_axes_area_auto_adjustable(ax2)
+        # make_axes_area_auto_adjustable(ax3)
         fig.savefig("exercise_2-5.png")
         plt.show()
 
 
 if __name__ == "__main__":
     arms = 10
-    steps = 1000
-    armedbandit = KArmedBandit(arms=arms, steps=steps, random_walk=False)
+    steps = 10000
+    armedbandit = KArmedBandit(arms=arms, steps=steps, random_walk=True)
     greedy = EpsilonGreedy(0.1, arms)
     plotter = Plotter(armedbandit, greedy, steps)
 
-    plotter.run(runs=100)
+    plotter.run(runs=1000)
     plotter.plot()
\ No newline at end of file
diff --git a/exercises.md b/exercises.md
index 0304ddf..784e580 100644
--- a/exercises.md
+++ b/exercises.md
@@ -142,4 +142,5 @@ Q_{n+1} = & \; Q_n + \alpha_n \left[R_n - Q_n\right] \\
 
 # Exercise 2.5
 
 See [exercise_2-5.py](./exercise_2-5.py) for code.
+![](./exercise_2-5.png)