add exercise 2.2, 2.3, 2.4, 2.5

Bart Moyaers
2020-02-21 11:40:58 +01:00
parent 49b372f967
commit e3e929689b
2 changed files with 268 additions and 2 deletions

exercise_2-5.py Normal file

@@ -0,0 +1,164 @@
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt


class KArmedBandit:
    """k-armed testbed; optionally nonstationary via a random walk on the true action values."""

    def __init__(self,
                 average_rewards=None,
                 arms=10,
                 steps=1000,
                 random_walk=False):
        self.average_rewards = average_rewards
        self.arms = arms
        self.steps = steps
        self.random_walk = random_walk
        if self.average_rewards is None:
            self.generate_average_rewards()
        self.generate_rewards()

    def generate_average_rewards(self):
        if self.random_walk:
            # Nonstationary case: all true action values start out equal (at zero).
            self.average_rewards = np.zeros(self.arms)
        else:
            self.average_rewards = np.random.normal(size=self.arms)

    def generate_rewards(self):
        if self.random_walk:
            # One independent reward list per arm (list comprehension so rows are not aliased).
            self.rewards = [[0.0] * self.steps for _ in range(self.arms)]
            for step in range(self.steps):
                # Slightly move all the averages
                moves = np.random.normal(loc=0.0, scale=0.01, size=self.arms)
                self.average_rewards = np.array(self.average_rewards) + moves
                for action, average in enumerate(self.average_rewards):
                    self.rewards[action][step] = np.random.normal(loc=average, scale=1)
        else:
            self.rewards = self.sample_rewards(self.steps)

    def sample_rewards(self, steps):
        return [np.random.normal(loc=x, size=steps) for x in self.average_rewards]

    def get_reward(self, action: int, step: int):
        if action >= self.arms or action < 0:
            raise ValueError("Action {} out of bounds. Actions can go from 0 to {}.".format(action, self.arms))
        if step >= self.steps or step < 0:
            raise ValueError("Step {} out of bounds. Current steps: {}".format(step, self.steps))
        return self.rewards[action][step]

    def get_max_reward_action(self):
        return (max(self.average_rewards), np.argmax(self.average_rewards))


class EpsilonGreedy:
    """Epsilon-greedy agent; the step size alpha is supplied by the caller on each update."""

    def __init__(self, epsilon: float, arms: int):
        self.epsilon = epsilon
        self.arms = arms
        self.actions = range(self.arms)
        self.reset()

    def reset(self):
        self.estimated_values = [0.0] * self.arms

    def update(self, action, reward, alpha):
        self.estimated_values[action] = self.estimated_values[action] + alpha * (reward - self.estimated_values[action])

    def choose_action(self) -> int:
        explore = bool(np.random.binomial(1, self.epsilon))
        if explore:
            return random.choice(self.actions)
        else:
            # Select action with highest estimated reward. If multiple highest exist: select a random one.
            max_estimate = max(self.estimated_values)
            return np.random.choice([index for index, x in enumerate(self.estimated_values) if x == max_estimate])


class Average:
    """Incrementally maintained sample average."""

    def __init__(self):
        self.value = 0.0
        self.count = 1

    def update(self, value):
        self.value = self.value + (1 / self.count) * (value - self.value)
        self.count += 1


class Plotter:
    """Runs the agent on the bandit for a number of independent runs and plots the results."""

    def __init__(self, karmedbandit: KArmedBandit, epsilongreedy: EpsilonGreedy, steps: int):
        self.karmedbandit = karmedbandit
        self.agent = epsilongreedy
        self.steps = steps
        self.reset()

    def reset(self):
        self.average_rewards = [Average() for _ in range(self.steps)]
        self.actions_counter = [0] * self.karmedbandit.arms
        self.total_steps = 0
        self.optimal_action_counter = 0
        self.optimal_action_fraction = [Average() for _ in range(self.steps)]

    def count_action(self, action):
        self.actions_counter[action] += 1
        self.total_steps += 1

    def count_optimal_action(self, is_optimal: bool, step):
        self.optimal_action_counter += 1
        if is_optimal:
            self.optimal_action_fraction[step].update(1)
        else:
            self.optimal_action_fraction[step].update(0)

    def update_optimal_action(self, optimal_action, step):
        # self.optimal_action_fraction[step].update(self.actions_counter[optimal_action] / (step+1))
        self.optimal_action_fraction[step].update(self.optimal_action_counter / (step + 1))

    def run(self, runs=1):
        for i in range(runs):
            self.karmedbandit.generate_rewards()
            # optimal_action = self.karmedbandit.get_max_reward_action()[1]
            self.agent.reset()
            self.actions_counter = [0] * self.karmedbandit.arms
            self.optimal_action_counter = 0
            for step in range(self.steps):
                optimal_action = self.karmedbandit.get_max_reward_action()[1]
                action = self.agent.choose_action()
                reward = self.karmedbandit.get_reward(action, step)
                # self.agent.update(action, reward, 1/(self.actions_counter[action]+1))
                self.agent.update(action, reward, 0.1)
                self.average_rewards[step].update(reward)
                self.count_action(action)
                if action == optimal_action:
                    self.count_optimal_action(True, step)
                else:
                    self.count_optimal_action(False, step)
                # self.update_optimal_action(optimal_action, step)

    def plot(self):
        fig, (ax1, ax2, ax3) = plt.subplots(nrows=3)
        # Reward distribution of each arm around its current true value.
        violin_data = self.karmedbandit.sample_rewards(steps=10000)
        ax1.violinplot(violin_data, showmeans=True, showmedians=False,
                       showextrema=False)
        ax1.set_ylim([-4, 4])
        ax2.plot(range(self.steps), [x.value for x in self.average_rewards])
        ylabel = "Average reward. Max average reward: {}".format(self.karmedbandit.get_max_reward_action()[0])
        ax2.set(ylabel=ylabel, title='')
        ax3.plot(range(self.steps), [x.value for x in self.optimal_action_fraction])
        ax3.set_ylim([0, 1])
        plt.xlabel("Steps")
        fig.savefig("exercise_2-5.png")
        plt.show()


if __name__ == "__main__":
    arms = 10
    steps = 1000
    armedbandit = KArmedBandit(arms=arms, steps=steps, random_walk=False)
    greedy = EpsilonGreedy(0.1, arms)
    plotter = Plotter(armedbandit, greedy, steps)
    plotter.run(runs=100)
    plotter.plot()


@@ -1,4 +1,15 @@
<!-- Exercises from Reinforcement Learning: An Introduction, Sutton and Barto, 2018 -->
# Table of content
- [Table of content](#table-of-content)
- [Exercise 1.1: Self-Play](#exercise-11-self-play)
- [Exercise 1.2: Symmetries](#exercise-12-symmetries)
- [Exercise 1.3: Greedy Play](#exercise-13-greedy-play)
- [Exercise 1.4: Learning from Exploration](#exercise-14-learning-from-exploration)
- [Exercise 2.1](#exercise-21)
- [Exercise 2.2](#exercise-22)
- [Exercise 2.3](#exercise-23)
- [Exercise 2.4](#exercise-24)
- [Exercise 2.5](#exercise-25)
# Exercise 1.1: Self-Play
*Suppose, instead of playing against a random opponent, the reinforcement learning algorithm described above played against itself, with both sides learning. What do you think would happen in this case? Would it learn a different policy for selecting moves?*
@@ -38,6 +49,97 @@ If we learn from exploratory moves, the state values of above-average states wou
# Exercise 2.1
*In $\varepsilon$-greedy action selection, for the case of two actions and $\varepsilon = 0.5$, what is the probability that the greedy action is selected?*
Let set $A = \{A_1,\,A_2\}$ be the set of actions where $V(A_1) > V(A_2)$. The chance $P(A_1)$ of selecting greedy action $A_1$ is then:
$$P(A_1)=(1-\varepsilon) + \varepsilon \frac{1}{|A|}=\left(1-0.5\right)+0.5 \cdot \frac{1}{2}=0.75$$
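As a quick sanity check (not part of the answer above), a minimal Monte Carlo sketch, assuming action $0$ plays the role of the greedy action $A_1$, reproduces the $0.75$:
```python
import numpy as np

# Monte Carlo check of P(A_1) = 0.75 for two actions and epsilon = 0.5.
rng = np.random.default_rng(0)
epsilon, n_actions, trials = 0.5, 2, 1_000_000
explore = rng.random(trials) < epsilon              # did the epsilon case occur?
random_pick = rng.integers(n_actions, size=trials)  # uniform pick when exploring
chosen = np.where(explore, random_pick, 0)          # action 0 is the greedy action A_1
print((chosen == 0).mean())                         # prints roughly 0.75
```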
# Exercise 2.2
*Bandit example. Consider a $k$-armed bandit problem with $k=4$ actions, denoted $1, 2, 3, 4$. Consider applying to this problem a bandit algorithm using $\varepsilon$-greedy action selection, sample-average action-value estimates, and initial estimates of $Q_1(a)=0$ for all $a$. Suppose the initial sequence of actions and rewards is $A_1=1, R_1=-1, A_2=2, R_2=1, A_3=2, R_3=-2, A_4=2, R_4=2, A_5=3, R_5=0$. On some of these time steps the $\varepsilon$ case may have occurred, causing an action to be selected at random. On which time steps did this definitely occur? On which time steps could this possibly have occurred?*
Let's go over the estimates $Q_t(a)$ step by step, using the rule
$$Q_t(a)=\frac{\sum_{i=1}^{t-1}{R_i\cdot\mathbb{1}_{A_i=a}}}{\sum_{i=1}^{t-1}\mathbb{1}_{A_i=a}}$$
>$t=0$
>$$Q_0(1)=0, Q_0(2)=0, Q_0(3)=0, Q_0(4)=0$$
>$t=1$
>$A_1=1, R_1=-1$
>$$Q_1(1)=-1, Q_1(2)=0, Q_1(3)=0, Q_1(4)=0$$
>$t=2$
>$A_2=2, R_2=1$
>$$Q_2(1)=-1, Q_2(2)=1, Q_2(3)=0, Q_2(4)=0$$
Action 2 now has the highest estimated value and should be selected next, unless the $\varepsilon$ case occurs.
>$t=3$
>$A_3=2, R_3=-2$
>$$Q_3(1)=-1, Q_3(2)=-0.5, Q_3(3)=0, Q_3(4)=0$$
As expected, action $2$ was selected. This might also have been an $\varepsilon$ case, which by chance selected the highest valued action $2$. The best actions to take in the greedy case are now $3$ and $4$.
>$t=4$
>$A_4=2, R_4=2$
>$$Q_4(1)=-1, Q_4(2)=\frac{1}{3}, Q_4(3)=0, Q_4(4)=0$$
Since action $2$ was selected while it did not have the highest estimated value ($Q_3(2)=-0.5$, whereas actions $3$ and $4$ are estimated at $0$), this must have been an $\varepsilon$ case!
>$t=5$
>$A_5=3, R_5=0$
>$$Q_5(1)=-1, Q_5(2)=\frac{1}{3}, Q_5(3)=0, Q_5(4)=0$$
Since action $3$ was selected while action $2$ had the highest estimated value ($Q_4(2)=\frac{1}{3}$), this must have been an $\varepsilon$ case!
We can say with certainty that timesteps $t=4$ and $t=5$ were $\varepsilon$ cases. In an exploratory case, it's also possible that the highest valued action is chosen. This means that it's never possible to prove that a case was not exploratory. All of the timesteps $1,2,3,4,5$ might have been exploratory cases.
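The same bookkeeping can be automated; a small sketch (not part of the original solution) that recomputes the sample-average estimates for the given sequence and flags the time steps that cannot have been greedy:
```python
# Recompute Q_t(a) with sample averages for the sequence from Exercise 2.2 and
# flag the time steps where the chosen action was not among the greedy ones,
# i.e. where the epsilon case must have occurred.
actions = [1, 2, 2, 2, 3]          # A_1 .. A_5
rewards = [-1, 1, -2, 2, 0]        # R_1 .. R_5
k = 4

Q = [0.0] * (k + 1)                # 1-based: Q[1] .. Q[4], initial estimates 0
N = [0] * (k + 1)                  # pick counts

for t, (a, r) in enumerate(zip(actions, rewards), start=1):
    greedy = [i for i in range(1, k + 1) if Q[i] == max(Q[1:])]
    definitely_eps = a not in greedy
    print(f"t={t}: A={a}, R={r}, greedy set before pick={greedy}, "
          f"definitely epsilon case: {definitely_eps}")
    N[a] += 1
    Q[a] += (r - Q[a]) / N[a]      # incremental sample-average update
```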
# Exercise 2.3
*In the comparison shown in Figure 2.2, which method will perform best in the long run in terms of cumulative reward and probability of selecting the best action? How much better will it be? Express your answer quantitatively.*
In Figure 2.1 we can see that action $3$ has the highest average reward: $q_*(3)\approxeq1.55$.
Any $\varepsilon$-greedy method with $\varepsilon>0$ will in the long run learn the mean rewards $q_*(a)$:
$$
\lim_{t \to \infty} Q_t(a) = q_*(a)\;|\;\varepsilon>0
$$
This means that as step $t$ goes to infinity, the probability of selecting the most rewarding action $3$ depends only on $\varepsilon$:
$$
\lim_{t \to \infty} P(A_t=3)=(1-\varepsilon)+\varepsilon\cdot\frac{1}{|A|}\;|\;\varepsilon>0
$$
The average reward $R_t$ with step $t$ going to infinity would then be (ignoring the small contribution from exploratory picks of the other arms):
$$
\lim_{t \to \infty} R_t =
\lim_{t \to \infty} P(A_t=3)\cdot q_*(3)=
\left((1-\varepsilon)+\varepsilon\cdot\frac{1}{|A|}\right)\cdot q_*(3)\;|\;\varepsilon>0
$$
We can see that methods with a lower $\varepsilon$ will get a higher average reward in the distant future. We can calculate the average reward $R_t$ for the two values of $\varepsilon$ given in Figure 2.2:
$$
\varepsilon = 0.1 \Rightarrow
\lim_{t \to \infty}R_t = 1.55\cdot \left((1-0.1)+ 0.1 \cdot \frac{1}{10} \right)=
1.55\cdot0.91\approxeq1.41
$$
$$
\varepsilon = 0.01 \Rightarrow
\lim_{t \to \infty}R_t = 1.55\cdot \left((1-0.01)+ 0.01 \cdot \frac{1}{10} \right)=
1.55\cdot0.991\approxeq1.536
$$
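For reference, the same numbers computed in a few lines of Python, taking $q_*(3)\approxeq1.55$ as read off Figure 2.1:
```python
# Limiting average reward of epsilon-greedy on the 10-armed testbed of Figure 2.2,
# using the approximation derived above.
q_star_best = 1.55
n_arms = 10
for epsilon in (0.1, 0.01):
    p_best = (1 - epsilon) + epsilon / n_arms
    print(f"epsilon={epsilon}: P(best) = {p_best:.3f}, "
          f"limiting average reward ~ {p_best * q_star_best:.3f}")
```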
# Exercise 2.4
*If the step-size parameters, $\alpha_n$, are not constant, then the estimate $Q_n$ is a weighted average of previously received rewards with a weighting different from that given by (2.6). What is the weighting on each prior reward for the general case, analogous to (2.6), in terms of the sequence of step-size parameters?*
$$\begin{aligned}
Q_{n+1} = & \; Q_n + \alpha_n \left[R_n - Q_n\right] \\
= & \; \alpha_n R_n +\left(1-\alpha_n\right)Q_n \\
= & \; \alpha_n R_n+ (1-\alpha_n)\left[\alpha_{n-1}R_{n-1}+(1-\alpha_{n-1})Q_{n-1}\right] \\
= & \; \alpha_n R_n+(1-\alpha_n)\alpha_{n-1}R_{n-1}+(1-\alpha_n)(1-\alpha_{n-1})Q_{n-1} \\
= & \; \alpha_n R_n+(1-\alpha_n)\alpha_{n-1}R_{n-1}+(1-\alpha_n)(1-\alpha_{n-1})\left[\alpha_{n-2}R_{n-2}+(1-\alpha_{n-2})Q_{n-2}\right] \\
= & \; \alpha_n R_n+(1-\alpha_n)\alpha_{n-1}R_{n-1}+(1-\alpha_n)(1-\alpha_{n-1})\alpha_{n-2} R_{n-2} \\
& + (1-\alpha_n)(1-\alpha_{n-1})(1-\alpha_{n-2})Q_{n-2} \\
= & \; \sum_{i=1}^{n}\left[R_i\alpha_i\prod_{j=i+1}^{n}(1-\alpha_{j})\right] + Q_1\prod_{i=1}^n(1-\alpha_i)
\end{aligned}$$
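So the weight on each prior reward $R_i$ is $\alpha_i\prod_{j=i+1}^{n}(1-\alpha_j)$, and the initial estimate $Q_1$ keeps a weight of $\prod_{i=1}^{n}(1-\alpha_i)$. A quick numerical sketch checking this closed form against the recursive update, with an arbitrary made-up step-size sequence:
```python
# Check that the closed form above matches the recursive update
# Q_{n+1} = Q_n + alpha_n (R_n - Q_n) for an arbitrary step-size sequence.
import numpy as np

rng = np.random.default_rng(0)
n = 50
R = rng.normal(size=n)                     # rewards R_1 .. R_n
alpha = rng.uniform(0.01, 1.0, size=n)     # step sizes alpha_1 .. alpha_n
Q1 = 0.5                                   # arbitrary initial estimate

# Recursive update
Q = Q1
for r, a in zip(R, alpha):
    Q += a * (r - Q)

# Closed form: sum_i R_i alpha_i prod_{j>i} (1 - alpha_j) + Q_1 prod_i (1 - alpha_i)
closed = Q1 * np.prod(1 - alpha)
for i in range(n):
    closed += R[i] * alpha[i] * np.prod(1 - alpha[i + 1:])

print(abs(Q - closed))                     # ~0 up to floating-point error
```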
# Exercise 2.5
See [exercise_2-5.py](./exercise_2-5.py) for the code. The script builds a 10-armed testbed (optionally nonstationary: with `random_walk=True` the true action values all start at zero and take independent random walks with standard deviation $0.01$ per step), runs an $\varepsilon$-greedy agent ($\varepsilon=0.1$) with constant step-size $\alpha=0.1$ for 100 independent runs of 1000 steps each, and plots the per-arm reward distributions, the average reward per step, and the fraction of runs in which the optimal action was chosen. The resulting figure is saved to `exercise_2-5.png`.
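The commented-out sample-average update in the script hints at the comparison this exercise asks for. A minimal standalone sketch (with assumed parameters, independent of the script) of how a sample average loses track of a drifting action value while a constant step size keeps tracking it:
```python
# Track a single nonstationary action value whose true mean drifts by a random walk:
# compare the sample-average estimate (alpha_n = 1/n) with a constant step size.
import numpy as np

rng = np.random.default_rng(0)
steps, alpha = 10_000, 0.1
q_true = 0.0
q_sample_avg, q_const = 0.0, 0.0
err_sample, err_const = 0.0, 0.0

for n in range(1, steps + 1):
    q_true += rng.normal(0.0, 0.01)         # random-walk drift of the true value
    r = rng.normal(q_true, 1.0)             # observed reward
    q_sample_avg += (r - q_sample_avg) / n  # sample-average update
    q_const += alpha * (r - q_const)        # constant step-size update
    err_sample += abs(q_true - q_sample_avg)
    err_const += abs(q_true - q_const)

print("mean |error|, sample average :", err_sample / steps)
print("mean |error|, constant alpha :", err_const / steps)
```
Over a long enough horizon the sample-average error should keep growing as the true value wanders away from the long-run mean of past rewards, which is exactly the difficulty the constant step-size update avoids.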