add exercise 2.2, 2.3, 2.4, 2.5
exercise_2-5.py | 164 lines | Normal file
@@ -0,0 +1,164 @@
import random

import matplotlib.pyplot as plt
import numpy as np
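

# A k-armed bandit testbed: each arm pays out a reward drawn from a
# unit-variance normal around that arm's mean. With random_walk=True all
# means start equal and take independent random-walk steps each time step,
# which is the nonstationary setting this exercise is about.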
class KArmedBandit:
    def __init__(self,
                 average_rewards=None,
                 arms=10,
                 steps=1000,
                 random_walk=False):
        self.average_rewards = average_rewards
        self.arms = arms
        self.steps = steps
        self.random_walk = random_walk
        if self.average_rewards is None:
            self.generate_average_rewards()
        self.generate_rewards()
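
    # Initialise the per-arm mean rewards: all zeros in the random-walk case,
    # otherwise one draw per arm from a standard normal.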
    def generate_average_rewards(self):
        if self.random_walk:
            self.average_rewards = np.zeros(self.arms)
        else:
            self.average_rewards = np.random.normal(size=self.arms)
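
    # Pre-generate the whole arms x steps reward table. In the random-walk
    # case every mean drifts by N(0, 0.01) noise before each step's rewards
    # are sampled.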
    def generate_rewards(self):
        if self.random_walk:
            # A list comprehension gives each arm its own list;
            # [[0.0] * steps] * arms would alias one list across all arms.
            self.rewards = [[0.0] * self.steps for _ in range(self.arms)]
            for step in range(self.steps):
                # Slightly move all the averages
                moves = np.random.normal(loc=0.0, scale=0.01, size=self.arms)
                self.average_rewards = np.array(self.average_rewards) + moves
                for action, average in enumerate(self.average_rewards):
                    # Draw a scalar reward around the current mean.
                    self.rewards[action][step] = np.random.normal(loc=average, scale=1)
        else:
            self.rewards = self.sample_rewards(self.steps)
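
    # Draw `steps` fresh samples per arm from the current means; also used
    # to build the violin plot of the reward distributions.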
    def sample_rewards(self, steps):
        return [np.random.normal(loc=x, size=steps) for x in self.average_rewards]
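
    # Bounds-checked lookup into the pre-generated reward table.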
    def get_reward(self, action: int, step: int):
        if action >= self.arms or action < 0:
            raise ValueError("Action {} out of bounds. Actions range from 0 to {}.".format(action, self.arms - 1))
        if step >= self.steps or step < 0:
            raise ValueError("Step {} out of bounds. Current steps: {}".format(step, self.steps))

        return self.rewards[action][step]

    def get_max_reward_action(self):
        return (max(self.average_rewards), np.argmax(self.average_rewards))
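

# An epsilon-greedy agent: with probability epsilon it explores a uniformly
# random arm, otherwise it exploits the arm with the highest value estimate.
# The step size alpha is supplied by the caller, so the same class covers
# both sample averages (alpha = 1/n) and the constant step size that tracks
# nonstationary rewards.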
class EpsilonGreedy:
    def __init__(self, epsilon: float, arms: int):
        self.epsilon = epsilon
        self.arms = arms
        self.actions = range(self.arms)
        self.reset()

    def reset(self):
        self.estimated_values = [0.0] * self.arms

    def update(self, action, reward, alpha):
        # Incremental estimate: Q <- Q + alpha * (reward - Q).
        self.estimated_values[action] = self.estimated_values[action] + alpha * (reward - self.estimated_values[action])

    def choose_action(self) -> int:
        explore = bool(np.random.binomial(1, self.epsilon))
        if explore:
            return random.choice(self.actions)
        else:
            # Select the action with the highest estimated reward; if several
            # actions tie for the maximum, pick one of them at random.
            max_estimate = max(self.estimated_values)
            return np.random.choice([index for index, x in enumerate(self.estimated_values) if x == max_estimate])
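

# A running mean maintained incrementally, so per-step averages over many
# runs can be accumulated without storing every reward.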
class Average:
    def __init__(self):
        self.value = 0.0
        self.count = 1

    def update(self, value):
        self.value = self.value + (1 / self.count) * (value - self.value)
        self.count += 1
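

# Runs the bandit/agent experiment for many independent runs, aggregates the
# per-step average reward and the per-step fraction of optimal actions, and
# renders the summary figure.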
class Plotter:
    def __init__(self, karmedbandit: KArmedBandit, epsilongreedy: EpsilonGreedy, steps: int):
        self.karmedbandit = karmedbandit
        self.agent = epsilongreedy
        self.steps = steps
        self.reset()

    def reset(self):
        self.average_rewards = [Average() for _ in range(self.steps)]
        self.actions_counter = [0] * self.karmedbandit.arms
        self.total_steps = 0
        self.optimal_action_counter = 0
        self.optimal_action_fraction = [Average() for _ in range(self.steps)]

    def count_action(self, action):
        self.actions_counter[action] += 1
        self.total_steps += 1

    def count_optimal_action(self, is_optimal: bool, step):
        if is_optimal:
            # The counter tracks how often the optimal action was actually taken.
            self.optimal_action_counter += 1
            self.optimal_action_fraction[step].update(1)
        else:
            self.optimal_action_fraction[step].update(0)

    def update_optimal_action(self, optimal_action, step):
        # self.optimal_action_fraction[step].update(self.actions_counter[optimal_action] / (step + 1))
        self.optimal_action_fraction[step].update(self.optimal_action_counter / (step + 1))
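
    # One experiment: for each run, resample the bandit's rewards and reset
    # the agent, then play `steps` actions. The per-step Average objects
    # persist across runs, so they end up holding means over all runs.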
    def run(self, runs=1):
        for i in range(runs):
            self.karmedbandit.generate_rewards()
            # optimal_action = self.karmedbandit.get_max_reward_action()[1]
            self.agent.reset()
            self.actions_counter = [0] * self.karmedbandit.arms
            self.optimal_action_counter = 0
            for step in range(self.steps):
                optimal_action = self.karmedbandit.get_max_reward_action()[1]
                action = self.agent.choose_action()
                reward = self.karmedbandit.get_reward(action, step)
                # Sample-average alternative:
                # self.agent.update(action, reward, 1 / (self.actions_counter[action] + 1))
                self.agent.update(action, reward, 0.1)  # constant step size
                self.average_rewards[step].update(reward)
                self.count_action(action)
                if action == optimal_action:
                    self.count_optimal_action(True, step)
                else:
                    self.count_optimal_action(False, step)
                # self.update_optimal_action(optimal_action, step)
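
    # Three stacked panels: the arms' reward distributions (violin plot),
    # the average reward per step, and the fraction of optimal actions.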
    def plot(self):
        fig, (ax1, ax2, ax3) = plt.subplots(nrows=3)

        violin_data = self.karmedbandit.sample_rewards(steps=10000)
        ax1.violinplot(violin_data, showmeans=True, showmedians=False,
                       showextrema=False)
        ax1.set_ylim([-4, 4])

        ax2.plot(range(self.steps), [x.value for x in self.average_rewards])
        ylabel = "Average reward. Max average reward: {}".format(self.karmedbandit.get_max_reward_action()[0])
        ax2.set(ylabel=ylabel, title='')

        ax3.plot(range(self.steps), [x.value for x in self.optimal_action_fraction])
        ax3.set_ylim([0, 1])

        plt.xlabel("Steps")
        fig.savefig("exercise_2-5.png")
        plt.show()
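

# To look at the nonstationary case the exercise asks about, one would set
# random_walk=True and use longer runs (e.g. 10000 steps) so the drift in
# the arm means becomes visible.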
if __name__ == "__main__":
    arms = 10
    steps = 1000
    armedbandit = KArmedBandit(arms=arms, steps=steps, random_walk=False)
    greedy = EpsilonGreedy(0.1, arms)

    plotter = Plotter(armedbandit, greedy, steps)
    plotter.run(runs=100)
    plotter.plot()