From e3e929689be194cda57c7ec2df9ce6a89c7a5917 Mon Sep 17 00:00:00 2001
From: Bart Moyaers
Date: Fri, 21 Feb 2020 11:40:58 +0100
Subject: [PATCH] add exercise 2.2, 2.3, 2.4, 2.5

---
 exercise_2-5.py | 164 ++++++++++++++++++++++++++++++++++++++++++++++++
 exercises.md    | 106 ++++++++++++++++++++++++++++++-
 2 files changed, 268 insertions(+), 2 deletions(-)
 create mode 100644 exercise_2-5.py

diff --git a/exercise_2-5.py b/exercise_2-5.py
new file mode 100644
index 0000000..d1f2ace
--- /dev/null
+++ b/exercise_2-5.py
@@ -0,0 +1,164 @@
+import numpy as np
+import random
+import matplotlib
+import matplotlib.pyplot as plt
+
+
+# k-armed testbed: true action values are drawn from a standard normal, or
+# (random_walk=True) start equal at zero and drift by N(0, 0.01) increments each step.
+class KArmedBandit:
+    def __init__(self,
+                 average_rewards=None,
+                 arms=10,
+                 steps=1000,
+                 random_walk=False):
+        self.average_rewards = average_rewards
+        self.arms = arms
+        self.steps = steps
+        self.random_walk = random_walk
+        if self.average_rewards is None:
+            self.generate_average_rewards()
+        self.generate_rewards()
+
+    def generate_average_rewards(self):
+        if self.random_walk:
+            self.average_rewards = np.zeros(self.arms)
+        else:
+            self.average_rewards = np.random.normal(size=self.arms)
+
+    def generate_rewards(self):
+        if self.random_walk:
+            # One independent reward list per arm (a list comprehension avoids
+            # aliasing a single inner list across all arms).
+            self.rewards = [[0.0] * self.steps for _ in range(self.arms)]
+            for step in range(self.steps):
+                # Slightly move all the averages
+                moves = np.random.normal(loc=0.0, scale=0.01, size=self.arms)
+                self.average_rewards = np.array(self.average_rewards) + moves
+                for action, average in enumerate(self.average_rewards):
+                    self.rewards[action][step] = np.random.normal(loc=average, scale=1)
+        else:
+            self.rewards = self.sample_rewards(self.steps)
+
+    def sample_rewards(self, steps):
+        return [np.random.normal(loc=x, size=steps) for x in self.average_rewards]
+
+    def get_reward(self, action: int, step: int):
+        if action >= self.arms or action < 0:
+            raise ValueError("Action {} out of bounds. Actions can go from 0 to {}.".format(action, self.arms - 1))
+        if step >= self.steps or step < 0:
+            raise ValueError("Step {} out of bounds. Current steps: {}".format(step, self.steps))
+
+        return self.rewards[action][step]
+
+    def get_max_reward_action(self):
+        return (max(self.average_rewards), np.argmax(self.average_rewards))
+
+
+# Epsilon-greedy agent; update() applies Q <- Q + alpha * (reward - Q)
+# for whatever step size alpha it is given.
+class EpsilonGreedy:
+    def __init__(self, epsilon: float, arms: int):
+        self.epsilon = epsilon
+        self.arms = arms
+        self.actions = range(self.arms)
+        self.reset()
+
+    def reset(self):
+        self.estimated_values = [0.0] * self.arms
+
+    def update(self, action, reward, alpha):
+        self.estimated_values[action] = self.estimated_values[action] + alpha * (reward - self.estimated_values[action])
+
+    def choose_action(self) -> int:
+        explore = bool(np.random.binomial(1, self.epsilon))
+        if explore:
+            return random.choice(self.actions)
+        else:
+            # Select action with highest estimated reward. If multiple highest exist: select a random one.
+            max_estimate = max(self.estimated_values)
+            return np.random.choice([index for index, x in enumerate(self.estimated_values) if x == max_estimate])
+
+
+# Incremental running average, used to average the per-step curves over runs.
+class Average:
+    def __init__(self):
+        self.value = 0.0
+        self.count = 1
+
+    def update(self, value):
+        self.value = self.value + (1 / self.count) * (value - self.value)
+        self.count += 1
+
+
+# Runs the agent against the bandit for a number of independent runs and plots the
+# reward distributions, the average reward per step and the fraction of optimal actions.
+class Plotter:
+    def __init__(self, karmedbandit: KArmedBandit, epsilongreedy: EpsilonGreedy, steps: int):
+        self.karmedbandit = karmedbandit
+        self.agent = epsilongreedy
+        self.steps = steps
+        self.reset()
+
+    def reset(self):
+        self.average_rewards = [Average() for _ in range(self.steps)]
+        self.actions_counter = [0] * self.karmedbandit.arms
+        self.total_steps = 0
+        self.optimal_action_counter = 0
+        self.optimal_action_fraction = [Average() for _ in range(self.steps)]
+
+    def count_action(self, action):
+        self.actions_counter[action] += 1
+        self.total_steps += 1
+
+    def count_optimal_action(self, is_optimal: bool, step):
+        if is_optimal:
+            # Only count the steps on which the optimal action was actually taken.
+            self.optimal_action_counter += 1
+            self.optimal_action_fraction[step].update(1)
+        else:
+            self.optimal_action_fraction[step].update(0)
+
+    def update_optimal_action(self, optimal_action, step):
+        # self.optimal_action_fraction[step].update(self.actions_counter[optimal_action] / (step+1))
+        self.optimal_action_fraction[step].update(self.optimal_action_counter / (step + 1))
+
+    def run(self, runs=1):
+        for _ in range(runs):
+            self.karmedbandit.generate_rewards()
+            # optimal_action = self.karmedbandit.get_max_reward_action()[1]
+            self.agent.reset()
+            self.actions_counter = [0] * self.karmedbandit.arms
+            self.optimal_action_counter = 0
+            for step in range(self.steps):
+                optimal_action = self.karmedbandit.get_max_reward_action()[1]
+                action = self.agent.choose_action()
+                reward = self.karmedbandit.get_reward(action, step)
+                # self.agent.update(action, reward, 1/(self.actions_counter[action]+1))
+                self.agent.update(action, reward, 0.1)
+                self.average_rewards[step].update(reward)
+                self.count_action(action)
+                if action == optimal_action:
+                    self.count_optimal_action(True, step)
+                else:
+                    self.count_optimal_action(False, step)
+                # self.update_optimal_action(optimal_action, step)
+
+    def plot(self):
+        fig, (ax1, ax2, ax3) = plt.subplots(nrows=3)
+
+        violin_data = self.karmedbandit.sample_rewards(steps=10000)
+        ax1.violinplot(violin_data, showmeans=True, showmedians=False,
+                       showextrema=False)
+        ax1.set_ylim([-4, 4])
+
+        ax2.plot(range(self.steps), [x.value for x in self.average_rewards])
+
+        ylabel = "Average reward. Max average reward: {}".format(self.karmedbandit.get_max_reward_action()[0])
+        ax2.set(ylabel=ylabel, title='')
+
+        ax3.plot(range(self.steps), [x.value for x in self.optimal_action_fraction])
+        ax3.set_ylim([0, 1])
+
+        plt.xlabel("Steps")
+        fig.savefig("exercise_2-5.png")
+        plt.show()
+
+
+if __name__ == "__main__":
+    arms = 10
+    steps = 1000
+    armedbandit = KArmedBandit(arms=arms, steps=steps, random_walk=False)
+    greedy = EpsilonGreedy(0.1, arms)
+
+    plotter = Plotter(armedbandit, greedy, steps)
+    plotter.run(runs=100)
+    plotter.plot()
\ No newline at end of file
diff --git a/exercises.md b/exercises.md
index 33aa75f..0304ddf 100644
--- a/exercises.md
+++ b/exercises.md
@@ -1,4 +1,15 @@
+# Table of Contents
+- [Table of Contents](#table-of-contents)
+- [Exercise 1.1: Self-Play](#exercise-11-self-play)
+- [Exercise 1.2: Symmetries](#exercise-12-symmetries)
+- [Exercise 1.3: Greedy Play](#exercise-13-greedy-play)
+- [Exercise 1.4: Learning from Exploration](#exercise-14-learning-from-exploration)
+- [Exercise 2.1](#exercise-21)
+- [Exercise 2.2](#exercise-22)
+- [Exercise 2.3](#exercise-23)
+- [Exercise 2.4](#exercise-24)
+- [Exercise 2.5](#exercise-25)
 # Exercise 1.1: Self-Play
 *Suppose, instead of playing against a random opponent, the reinforcement learning algorithm described above played against itself, with both sides learning. What do you think would happen in this case? Would it learn a different policy for selecting moves?*
@@ -38,6 +49,97 @@ If we learn from exploratory moves, the state values of above-average states wou
 # Exercise 2.1
 *In $\varepsilon$-greedy action selection, for the case of two actions and $\varepsilon = 0.5$, what is the probability that the greedy action is selected?*
 
-Let set $A = \{A_1,\,A_2\}$ be the set of actions where $V(A_1) > V(A_2)$. The chance $P(A_1)$ of selecting greedy action $A_1$ is then
-$$P(A_1)=\varepsilon + \varepsilon \frac{1}{|A|}=\varepsilon \left(1+\frac{1}{|A|}\right)=0.5\left(1+\frac{1}{2}\right)=0.75$$
+Let $A = \{A_1,\,A_2\}$ be the set of actions, where $V(A_1) > V(A_2)$. The probability $P(A_1)$ of selecting the greedy action $A_1$ is then:
+$$P(A_1)=(1-\varepsilon) + \varepsilon \frac{1}{|A|}=\left(1-0.5\right)+0.5 \cdot \frac{1}{2}=0.75$$
+
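+As a quick sanity check, this probability can also be estimated empirically. A minimal Monte Carlo sketch (assuming NumPy; the two-action setup and $\varepsilon=0.5$ are taken from the exercise):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+epsilon, n_actions, greedy_action = 0.5, 2, 0
+samples = 100_000
+
+chosen = np.where(
+    rng.random(samples) < epsilon,          # explore: pick uniformly among all actions
+    rng.integers(n_actions, size=samples),  # ...which may still happen to be the greedy one
+    greedy_action,                          # exploit: pick the greedy action
+)
+print((chosen == greedy_action).mean())  # ~0.75 = (1 - epsilon) + epsilon / |A|
+```
+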
+# Exercise 2.2
+***Bandit example.*** *Consider a $k$-armed bandit problem with $k=4$ actions, denoted $1, 2, 3, 4$. Consider applying to this problem a bandit algorithm using $\varepsilon$-greedy action selection, sample-average action-value estimates, and initial estimates of $Q_1(a)=0$ for all $a$. Suppose the initial sequence of actions and rewards is $A_1=1, R_1=-1, A_2=2, R_2=1, A_3=2, R_3=-2, A_4=2, R_4=2, A_5=3, R_5=0$. On some of these time steps the $\varepsilon$ case may have occurred, causing an action to be selected at random. On which time steps did this definitely occur? On which time steps could this possibly have occurred?*
+
+Let's go over the estimates $Q_t(a)$ step by step. Here $Q_t(a)$ denotes the sample average after the reward of step $t$ has been observed (one step ahead of the book's notation, where $Q_t$ is the estimate available when $A_t$ is chosen):
+$$Q_t(a)=\frac{\sum_{i=1}^{t}{R_i\cdot\mathbb{1}_{A_i=a}}}{\sum_{i=1}^{t}\mathbb{1}_{A_i=a}}$$
+
+>$t=0$
+>$$Q_0(1)=0, Q_0(2)=0, Q_0(3)=0, Q_0(4)=0$$
+
+>$t=1$
+>$A_1=1, R_1=-1$
+>$$Q_1(1)=-1, Q_1(2)=0, Q_1(3)=0, Q_1(4)=0$$
+
+>$t=2$
+>$A_2=2, R_2=1$
+>$$Q_2(1)=-1, Q_2(2)=1, Q_2(3)=0, Q_2(4)=0$$
+
+Action $2$ now has the highest estimated value, so it should be selected next unless the $\varepsilon$ case occurs.
+
+>$t=3$
+>$A_3=2, R_3=-2$
+>$$Q_3(1)=-1, Q_3(2)=-0.5, Q_3(3)=0, Q_3(4)=0$$
+
+As expected, action $2$ was selected. This might nevertheless have been an $\varepsilon$ case that happened to pick the highest-valued action $2$. The greedy actions are now $3$ and $4$.
+
+>$t=4$
+>$A_4=2, R_4=2$
+>$$Q_4(1)=-1, Q_4(2)=\frac{1}{3}, Q_4(3)=0, Q_4(4)=0$$
+
+Action $2$ was selected even though its estimate $Q_3(2)=-0.5$ was below the maximum estimate ($Q_3(3)=Q_3(4)=0$), so this must have been an $\varepsilon$ case!
+
+>$t=5$
+>$A_5=3, R_5=0$
+>$$Q_5(1)=-1, Q_5(2)=\frac{1}{3}, Q_5(3)=0, Q_5(4)=0$$
+
+Action $3$ was selected even though action $2$ had the highest estimate ($Q_4(2)=\frac{1}{3}$), so this must have been an $\varepsilon$ case!
+
+We can say with certainty that time steps $t=4$ and $t=5$ were $\varepsilon$ cases. Since an exploratory step can also happen to choose the highest-valued action, it is never possible to prove that a step was *not* exploratory: every one of the time steps $1,2,3,4,5$ might have been an $\varepsilon$ case.
+
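+The bookkeeping above can be double-checked with a short script that replays the given action-reward sequence and recomputes the sample-average estimates (a sketch, separate from the exercise code):
+
+```python
+actions = [1, 2, 2, 2, 3]           # A_1 .. A_5
+rewards = [-1, 1, -2, 2, 0]         # R_1 .. R_5
+Q = {a: 0.0 for a in (1, 2, 3, 4)}  # initial estimates Q_0(a) = 0
+N = {a: 0 for a in Q}               # times each action has been taken
+
+for t, (a, r) in enumerate(zip(actions, rewards), start=1):
+    # If the chosen action's estimate was below the current maximum,
+    # the epsilon (exploration) case must have occurred at this step.
+    print(f"t={t}: chose {a}, definitely exploratory: {Q[a] < max(Q.values())}")
+    N[a] += 1
+    Q[a] += (r - Q[a]) / N[a]       # incremental sample-average update
+
+print(Q)  # {1: -1.0, 2: 0.333..., 3: 0.0, 4: 0.0}
+```
+
+It reports `definitely exploratory: True` only for $t=4$ and $t=5$, matching the reasoning above.
+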
+# Exercise 2.3
+*In the comparison shown in Figure 2.2, which method will perform best in the long run in terms of cumulative reward and probability of selecting the best action? How much better will it be? Express your answer quantitatively.*
+
+In Figure 2.1 we can see that action $3$ has the highest average reward: $q_*(3)\approx 1.55$.
+
+Any $\varepsilon$-greedy method with $\varepsilon>0$ keeps selecting every action infinitely often, so with sample-average estimates it will in the long run learn the mean rewards $q_*(a)$:
+$$
+\lim_{t \to \infty} Q_t(a) = q_*(a) \qquad (\varepsilon>0)
+$$
+
+This means that as $t$ goes to infinity, the probability of selecting the most rewarding action $3$ depends only on $\varepsilon$:
+$$
+\lim_{t \to \infty} P(A_t=3)=(1-\varepsilon)+\varepsilon\cdot\frac{1}{|A|} \qquad (\varepsilon>0)
+$$
+
+Treating the mean reward of the other nine actions as approximately zero (the $q_*(a)$ in Figure 2.1 are drawn from a standard normal distribution), the expected reward as $t$ goes to infinity is then approximately:
+$$
+\lim_{t \to \infty} \mathbb{E}[R_t] \approx
+\lim_{t \to \infty} P(A_t=3)\cdot q_*(3)=
+\left((1-\varepsilon)+\varepsilon\cdot\frac{1}{|A|}\right)\cdot q_*(3)
+$$
+
+We can see that methods with a lower $\varepsilon$ will get a higher average reward in the distant future. We can calculate this long-run average reward for the two values of $\varepsilon$ given in Figure 2.2:
+$$
+\varepsilon = 0.1 \Rightarrow
+\lim_{t \to \infty}\mathbb{E}[R_t] \approx 1.55\cdot \left((1-0.1)+ 0.1 \cdot \frac{1}{10} \right)=
+1.55\cdot0.91\approx 1.41
+$$
+
+$$
+\varepsilon = 0.01 \Rightarrow
+\lim_{t \to \infty}\mathbb{E}[R_t] \approx 1.55\cdot \left((1-0.01)+ 0.01 \cdot \frac{1}{10} \right)=
+1.55\cdot0.991\approx 1.536
+$$
+
+So in the long run the $\varepsilon=0.01$ method performs better: it selects the optimal action $99.1\%$ of the time versus $91\%$ for $\varepsilon=0.1$, and its average reward approaches roughly $1.54$ versus roughly $1.41$.
+
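+The same numbers can be reproduced with a few lines of Python (a sketch; $q_*(3)\approx1.55$ is read off Figure 2.1 and the mean reward of the other arms is treated as zero):
+
+```python
+q_star_best = 1.55  # approximate value of the best arm in Figure 2.1
+n_arms = 10
+
+for epsilon in (0.1, 0.01):
+    p_best = (1 - epsilon) + epsilon / n_arms
+    print(f"epsilon={epsilon}: P(best action) = {p_best:.3f}, "
+          f"long-run average reward ~ {p_best * q_star_best:.2f}")
+# epsilon=0.1:  P(best action) = 0.910, long-run average reward ~ 1.41
+# epsilon=0.01: P(best action) = 0.991, long-run average reward ~ 1.54
+```
+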
+# Exercise 2.4
+*If the step-size parameters, $\alpha_n$, are not constant, then the estimate $Q_n$ is a weighted average of previously received rewards with a weighting different from that given by (2.6). What is the weighting on each prior reward for the general case, analogous to (2.6), in terms of the sequence of step-size parameters?*
+
+Unrolling the update rule $Q_{n+1} = Q_n + \alpha_n\left[R_n - Q_n\right]$ gives:
+
+$$\begin{aligned}
+Q_{n+1} = & \; Q_n + \alpha_n \left[R_n - Q_n\right] \\
+= & \; \alpha_n R_n +\left(1-\alpha_n\right)Q_n \\
+= & \; \alpha_n R_n+ (1-\alpha_n)\left[\alpha_{n-1}R_{n-1}+(1-\alpha_{n-1})Q_{n-1}\right] \\
+= & \; \alpha_n R_n+(1-\alpha_n)\alpha_{n-1}R_{n-1}+(1-\alpha_n)(1-\alpha_{n-1})Q_{n-1} \\
+= & \; \alpha_n R_n+(1-\alpha_n)\alpha_{n-1}R_{n-1}+(1-\alpha_n)(1-\alpha_{n-1})\left[\alpha_{n-2}R_{n-2}+(1-\alpha_{n-2})Q_{n-2}\right] \\
+= & \; \alpha_n R_n+(1-\alpha_n)\alpha_{n-1}R_{n-1}+(1-\alpha_n)(1-\alpha_{n-1})\alpha_{n-2} R_{n-2} \\
+  & + (1-\alpha_n)(1-\alpha_{n-1})(1-\alpha_{n-2})Q_{n-2} \\
+= & \; \sum_{i=1}^{n}\left[R_i\,\alpha_i\prod_{j=i+1}^{n}(1-\alpha_{j})\right] + Q_1\prod_{i=1}^n(1-\alpha_i)
+\end{aligned}$$
+
+So the weight on each prior reward $R_i$ is $\alpha_i\prod_{j=i+1}^{n}(1-\alpha_j)$, and the initial estimate $Q_1$ carries the weight $\prod_{i=1}^{n}(1-\alpha_i)$. For a constant step size $\alpha_i=\alpha$ this reduces to the weighting $\alpha(1-\alpha)^{n-i}$ of (2.6).
+
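+The closed form can be sanity-checked numerically against the recursive update for an arbitrary step-size sequence (a sketch; the step sizes, rewards and $Q_1$ below are made up):
+
+```python
+import math
+import random
+
+random.seed(0)
+n = 8
+alphas = [random.uniform(0.05, 0.9) for _ in range(n)]  # step sizes alpha_1 .. alpha_n
+rewards = [random.gauss(0, 1) for _ in range(n)]        # rewards R_1 .. R_n
+Q1 = 0.5                                                # arbitrary initial estimate
+
+# Recursive form: Q_{k+1} = Q_k + alpha_k * (R_k - Q_k)
+Q = Q1
+for alpha, r in zip(alphas, rewards):
+    Q += alpha * (r - Q)
+
+# Closed form: sum_i R_i * alpha_i * prod_{j>i}(1 - alpha_j) + Q_1 * prod_i(1 - alpha_i)
+closed = sum(
+    rewards[i] * alphas[i] * math.prod(1 - a for a in alphas[i + 1:])
+    for i in range(n)
+) + Q1 * math.prod(1 - a for a in alphas)
+
+print(abs(Q - closed) < 1e-9)  # True: both forms agree
+```
+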
+# Exercise 2.5
+See [exercise_2-5.py](./exercise_2-5.py) for code.
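+
+A possible way to drive the comparison the exercise asks for, reusing the classes from that script, is sketched below. The run and step counts are arbitrary choices, the hyphen in the filename means the classes have to be imported indirectly (or the snippet appended to the script's `__main__` block), and switching between the constant step size and the commented-out sample-average update is still done by hand inside `Plotter.run`:
+
+```python
+# Assumes KArmedBandit, EpsilonGreedy and Plotter from exercise_2-5.py are in scope.
+arms, steps = 10, 10000
+
+# Nonstationary testbed: all q*(a) start equal and drift by independent random walks.
+bandit = KArmedBandit(arms=arms, steps=steps, random_walk=True)
+agent = EpsilonGreedy(0.1, arms)
+
+plotter = Plotter(bandit, agent, steps)
+plotter.run(runs=100)  # uses the constant step size alpha = 0.1 hard-coded in Plotter.run;
+                       # repeat with the sample-average update enabled there for the comparison
+plotter.plot()
+```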