Contextual Bandit: Code Example (Epsilon-Greedy)
import numpy as np

class ContextualBandit:
    def __init__(self, n_actions, n_contexts):
        self.n_actions = n_actions
        self.n_contexts = n_contexts
        self.Q = np.zeros((n_contexts, n_actions))  # Estimated rewards
        self.N = np.zeros((n_contexts, n_actions))  # Visit counts

    def select_action(self, context, epsilon=0.1):
        if np.random.rand() < epsilon:
            return np.random.randint(self.n_actions)  # Explore: random action
        else:
            return np.argmax(self.Q[context])  # Exploit: best estimated action

    def update(self, context, action, reward):
        self.N[context, action] += 1
        alpha = 1 / self.N[context, action]  # Sample-average step size
        self.Q[context, action] += alpha * (reward - self.Q[context, action])

# Simulated environment
def generate_reward(context, action):
    # Toy reward matrix: rows = contexts, cols = actions
    true_rewards = np.array([
        [0.1, 0.9, 0.2],
        [0.8, 0.1, 0.4],
        [0.3, 0.3, 0.7]
    ])
    return float(np.random.rand() < true_rewards[context, action])  # Bernoulli trial

# Setup
n_contexts = 3
n_actions = 3
bandit = ContextualBandit(n_actions, n_contexts)

# Run simulation
n_rounds = 10000
rewards = []
for _ in range(n_rounds):
    context = np.random.randint(n_contexts)
    action = bandit.select_action(context, epsilon=0.1)
    reward = generate_reward(context, action)
    bandit.update(context, action, reward)
    rewards.append(reward)

print(f"Average reward over {n_rounds} rounds: {np.mean(rewards):.4f}")
Notes:
- This example uses a toy reward matrix to simulate context-dependent success probabilities (Bernoulli rewards).
- The bandit learns which action performs best in each context via incremental sample averaging.
- Exploration is controlled by the epsilon parameter (epsilon-greedy); a common refinement is to decay epsilon over time, as sketched below.
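The following sketch shows one such decaying-epsilon variant (the schedule and hyperparameters are illustrative choices, not part of the original example); it explores heavily at first, then shifts toward exploitation:

# Sketch: decaying-epsilon schedule. Reuses ContextualBandit, generate_reward,
# n_actions, n_contexts, and n_rounds defined above; `bandit2` is a fresh learner.
bandit2 = ContextualBandit(n_actions, n_contexts)
eps, eps_min, decay = 1.0, 0.01, 0.999  # Illustrative values
for _ in range(n_rounds):
    context = np.random.randint(n_contexts)
    action = bandit2.select_action(context, epsilon=eps)
    reward = generate_reward(context, action)
    bandit2.update(context, action, reward)
    eps = max(eps_min, eps * decay)  # Anneal exploration toward eps_min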