Contextual Bandit: Code Example (Epsilon-Greedy)

A minimal tabular implementation: Q[context, action] keeps a running average of observed rewards for each context-action pair, and action selection is epsilon-greedy within each context.

import numpy as np

class ContextualBandit:
    def __init__(self, n_actions, n_contexts):
        self.n_actions = n_actions
        self.n_contexts = n_contexts
        self.Q = np.zeros((n_contexts, n_actions))  # Estimated rewards
        self.N = np.zeros((n_contexts, n_actions))  # Visit counts

    def select_action(self, context, epsilon=0.1):
        if np.random.rand() < epsilon:
            return np.random.randint(self.n_actions)  # Explore: random action
        else:
            return np.argmax(self.Q[context])  # Exploit: best estimated action

    def update(self, context, action, reward):
        self.N[context, action] += 1
        alpha = 1 / self.N[context, action]  # 1/N step size = running sample mean
        self.Q[context, action] += alpha * (reward - self.Q[context, action])

# Simulated environment
def generate_reward(context, action):
    # Toy reward matrix: rows = contexts, cols = actions
    true_rewards = np.array([
        [0.1, 0.9, 0.2],
        [0.8, 0.1, 0.4],
        [0.3, 0.3, 0.7]
    ])
    return int(np.random.rand() < true_rewards[context, action])  # Bernoulli trial: 1 with probability true_rewards[context, action]

# Setup
n_contexts = 3
n_actions = 3
bandit = ContextualBandit(n_actions, n_contexts)

# Run simulation
n_rounds = 10000
rewards = []

for _ in range(n_rounds):
    context = np.random.randint(n_contexts)
    action = bandit.select_action(context, epsilon=0.1)
    reward = generate_reward(context, action)
    bandit.update(context, action, reward)
    rewards.append(reward)

print(f"Average reward over {n_rounds} rounds: {np.mean(rewards):.4f}")

Notes:

  • This example uses a toy reward matrix to simulate a different success probability for each context-action pair.
  • The bandit learns which action performs best in each context; the sketch below compares its learned greedy policy with the true optimum.
  • Exploration is controlled by the epsilon parameter (epsilon-greedy); a decaying-epsilon variant follows at the end.
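
As a quick check of the second note, this snippet (run after the simulation loop) prints the learned greedy action next to the true best action for each context. The reward matrix is duplicated here under the name true_best because it is local to generate_reward in the example.

true_best = np.array([
    [0.1, 0.9, 0.2],
    [0.8, 0.1, 0.4],
    [0.3, 0.3, 0.7]
])
for context in range(n_contexts):
    learned = np.argmax(bandit.Q[context])
    optimal = np.argmax(true_best[context])
    print(f"Context {context}: learned action {learned}, optimal action {optimal}, "
          f"Q estimates = {np.round(bandit.Q[context], 3)}")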

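As a common refinement of the last note, epsilon can decay over time so the agent explores heavily early on and exploits more as its estimates improve. A minimal sketch reusing the environment above; the 1/sqrt(t) schedule and the 0.01 floor are illustrative choices, not part of the original example:

bandit = ContextualBandit(n_actions, n_contexts)
rewards = []
for t in range(1, n_rounds + 1):
    epsilon = max(0.01, 1.0 / np.sqrt(t))  # illustrative decay schedule with a 0.01 floor
    context = np.random.randint(n_contexts)
    action = bandit.select_action(context, epsilon=epsilon)
    reward = generate_reward(context, action)
    bandit.update(context, action, reward)
    rewards.append(reward)

print(f"Average reward with decaying epsilon: {np.mean(rewards):.4f}")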