Quickstart

The core functionality of \(\texttt{Colosseum}\) is to provide a principled benchmarking procedure. The goal of this tutorial is to implement a simple q-learning agent and benchmark it.

The agent’s components

Although new agents can be implemented directly from the abstract BaseAgent class by implementing its abstract methods, it is also possible to follow a modular approach that allows components to be re-used across different agents. Such an agent is built from two components:

- a BaseMDPModel, which stores and updates the agent’s estimates, e.g. estimated transition probabilities or q-value estimates;
- a BaseActor, which maintains the policy for interacting with the environment.

We’ll implement an MDP model for the q-value estimates (QModel) and an actor that implements the \(\epsilon\)-greedy policy (QValuesActor). The QLearningAgent class will combine these two components.
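The snippets below assume the following imports. This is a minimal sketch: the Colosseum classes and helpers referenced later (BaseAgent, BaseMDPModel, BaseActor, MDPSpec, ACTION_TYPE, get_policy_from_q_values, and the benchmarking utilities) must additionally be imported from the package, and their exact module paths are not reproduced here.

from typing import Any, Dict

import dm_env
import gin
import numpy as np
from ray import tune

# The Colosseum base classes and helpers used below (BaseAgent, BaseMDPModel,
# BaseActor, MDPSpec, ACTION_TYPE, get_policy_from_q_values, and the
# benchmarking utilities) must also be imported from the colosseum package;
# see the package documentation for their module paths.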

QModel

class QModel(BaseMDPModel):
    """ An MDP model for q-value estimates. """

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        alpha: float,
    ):
        """
        Parameters
        ----------
        seed : int
           the seed controlling the random parts of the model.
        mdp_specs : MDPSpec
            the specification of the MDP properties, e.g. number of states.
        alpha : float
            the coefficient for the q-learning update rule.
        """
        super(QModel, self).__init__(seed, mdp_specs)
        self._alpha = alpha
        
        # Random initialization of the q value estimates
        self.Q = self._rng.randn(self._H, self._n_states, self._n_actions).astype(np.float32)
        
        # State values initialised to the maximum reward
        self.V = np.zeros((self._H + 1, self._n_states), np.float32) + mdp_specs.rewards_range[1]

    def step_update(
        self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, h: int
    ):
        """
        Updates the q-value estimate using the q-learning update rule.
        
        Parameters
        ----------
        ts_t : dm_env.TimeStep
          the object containing the information at time step t.
        a_t : int
          the action the agent has taken at time step t.
        ts_tp1 : dm_env.TimeStep
          the object containing the information at time step t + 1.
        h : int
          the in-episode time step.
        """
    
        s_t = ts_t.observation
        s_tp1 = ts_tp1.observation

        # Q-learning update: convex combination of the previous estimate
        # (weighted by alpha) and the bootstrapped target
        self.Q[h, s_t, a_t] = self._alpha * self.Q[h, s_t, a_t] + (1 - self._alpha) * (
            ts_tp1.reward + self.V[h + 1, s_tp1]
        )
        # The state value estimate is capped at H, the maximum return when rewards lie in [0, 1]
        self.V[h, s_t] = min(self._H, self.Q[h, s_t].max())
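
In symbols, the update implemented in step_update reads

\[
Q_h(s_t, a_t) \leftarrow \alpha \, Q_h(s_t, a_t) + (1 - \alpha)\big(r_{t+1} + V_{h+1}(s_{t+1})\big),
\qquad
V_h(s_t) \leftarrow \min\big(H, \max_a Q_h(s_t, a)\big),
\]

where \(r_{t+1}\) is the reward received upon transitioning to \(s_{t+1}\), \(H\) is the episode horizon, and \(\alpha\) is the coefficient weighting the previous estimate.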

QValuesActor

class QValuesActor(BaseActor):
    """
    The epsilon greedy actor.
    """

    def __init__(
        self,
        seed: int,
        mdp_specs: MDPSpec,
        epsilon: float = 0.05,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        epsilon : float
            The probability of selecting an action at random. By default, the probability is set to 0.05.
        """
        super(QValuesActor, self).__init__(seed, mdp_specs)

        self._epsilon = epsilon
        self._n_states = self._mdp_spec.observations.num_values
        self._n_actions = self._mdp_spec.actions.num_values
        
        self._q_values = None

    def set_q_values(self, Q: np.ndarray):
        """
        Updates the q-value estimates of the actor with the ones given in input.

        Parameters
        ----------
        Q : np.ndarray
            The q-value estimates.
        """
        self._q_values = Q
        self._episodic = Q.ndim == 3

    def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE":
        assert self._q_values is not None, "The q values have not been initialized."

        # Epsilon exploration
        if self._rng_fast.random() < self._epsilon:
            return self._rng_fast.randint(0, self._n_actions - 1)

        # Retrieve the q-values
        q = self._q_values[(time, ts.observation) if self._episodic else ts.observation]

        # Greedy selection
        return self._rng.choice(np.where(q == q.max())[0])
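
To make the behaviour of the actor concrete, here is a minimal, self-contained sketch of the \(\epsilon\)-greedy rule it implements; the function name and the example q-values below are purely illustrative.

import numpy as np

def epsilon_greedy(q_row: np.ndarray, epsilon: float, rng: np.random.RandomState) -> int:
    """ Selects an action given the q-values of the current state. """
    if rng.rand() < epsilon:
        # Explore: pick an action uniformly at random
        return int(rng.randint(q_row.shape[0]))
    # Exploit: break ties uniformly among the actions with maximal q-value
    best_actions = np.where(q_row == q_row.max())[0]
    return int(rng.choice(best_actions))

rng = np.random.RandomState(42)
print(epsilon_greedy(np.array([0.1, 0.5, 0.5, -0.2]), epsilon=0.05, rng=rng))

Note that ties among maximal q-values are broken uniformly at random, which matches the greedy selection in QValuesActor.select_action.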

QLearningAgent

@gin.configurable
class QLearningAgent(BaseAgent):
    """
    The Agent class for the q-learning algorithm.
    """
    
    @staticmethod
    def is_episodic() -> bool:
        return True

    def __init__(
        self,
        # Base parameters
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model hyperparameters
        alpha: float,
        # Actor hyperparameters
        epsilon: float,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The specification of the MDP properties, e.g. number of states.
        optimization_horizon : int
            The total number of interactions with the MDP.
        alpha : float
            The coefficient for the q-learning update rule.
        epsilon : float
            The probability with which the \(\epsilon\)-greedy policy selects an action at random.
        """
        assert 0 < alpha < 1
        
        # The MDP model
        mdp_model = QModel(seed, mdp_specs, alpha)
        
        # The Actor model
        actor = QValuesActor(seed, mdp_specs, epsilon)

        super(QLearningAgent, self).__init__(
            seed, mdp_specs, mdp_model, actor, optimization_horizon
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        """
        Returns
        -------
        np.ndarray
            The optimal policy the agent would use if no more training was allowed.
        """
        Q_average = self._mdp_model.Q
        return get_policy_from_q_values(Q_average, True)
    
    def before_start_interacting(self):
        # Initialize the q-values in the actor
        self._actor.set_q_values(self._mdp_model.Q)
    
    def episode_end_update(self):
        """
        The Q-learning algorithm does not update anything at the end of the episode.
        """
    
    def step_update(
            self,
            ts_t: dm_env.TimeStep,
            a_t: "ACTION_TYPE",
            ts_tp1: dm_env.TimeStep,
            time: int,
        ):
        # This automatically updates the MDP model
        super(QLearningAgent, self).step_update(ts_t, a_t, ts_tp1, time)
        
        # Update the q-estimates at every time step
        self._actor.set_q_values(self._mdp_model.Q)

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        """
        Returns
        -------
            The sampling spaces associated with the hyperparameters of the algorithm.
        """
        return {
            "epsilon": tune.uniform(0.001, 0.1),
            "alpha": tune.uniform(0.001, 0.4),
        }

    @staticmethod
    def produce_gin_file_from_parameters(
        hyperparameters: Dict[str, Any], index: int = 0
    ) -> str:
        """
        Returns
        -------
          The hyperparameter values in gin file format.
        """
        string = ""
        for k, v in hyperparameters.items():
            string += f"prms_{index}/QLearningAgent.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: MDPSpec,
        hyperparameters: Dict[str, Any],
    ) -> "BaseAgent":
        """
        Returns
        -------
            The agent instance corresponding to the given hyperparameters and MDP specification.
        """
        return QLearningAgent(
            mdp_specs=mdp_specs,
            seed=seed,
            optimization_horizon=optimization_horizon,
            alpha=hyperparameters["alpha"],
            epsilon=hyperparameters["epsilon"],
        )

    @staticmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        """
        Returns
        -------
        bool
          True if the emission map in input is the tabular one.
        """
        return emission_map.is_tabular
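
As a quick check of the gin helper, calling produce_gin_file_from_parameters with a concrete (illustrative) hyperparameter dictionary produces one gin binding per hyperparameter.

print(QLearningAgent.produce_gin_file_from_parameters({"alpha": 0.1, "epsilon": 0.05}))
# prms_0/QLearningAgent.alpha = 0.1
# prms_0/QLearningAgent.epsilon = 0.05

This matches the format of the tuned configuration retrieved by the hyperparameter optimization below.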

Benchmarking procedure

We can now perform the benchmarking procedure, which consists of two steps: hyperparameter optimization and benchmarking.

Hyperparameters Optimization

# Register the new agent class
config.register_agent_class(QLearningAgent)

# Obtain the best hyperparameters
hyperopt_agents_and_benchmarks = sample_agent_configs_and_benchmarks_for_hyperopt([QLearningAgent], SMALL_HYPEROPT_CONF)
hp_exp_instances = instantiate_and_get_exp_instances_from_agents_and_benchmarks_for_hyperopt(
    hyperopt_agents_and_benchmarks
)
run_experiment_instances(hp_exp_instances)
agents_hps = retrieve_best_agent_config_from_hp_folder()

print(agents_hps[QLearningAgent])
prms_0/QLearningAgent.epsilon = 0.0381
prms_0/QLearningAgent.alpha = 0.3803

Benchmarking

We’ll benchmark the agent on the EPISODIC_QUICK_TEST benchmark, which is the default quick benchmark for the episodic setting, comparing it against the QLearningEpisodic agent implemented in the package.

# Add the QLearningEpisodic agent
# When the hyperparameters of a Colosseum agent class are not specified, they are
# retrieved from the package cache
agents_hps[QLearningEpisodic] = None

# Retrieve the EPISODIC_QUICK_TEST benchmark
benchmark = ColosseumDefaultBenchmark.EPISODIC_QUICK_TEST.get_benchmark()
agents_and_benchmarks = [
    (agents_hps, benchmark)
]

# Benchmarking the agents
experiment_instances = instantiate_and_get_exp_instances_from_agents_and_benchmarks(agents_and_benchmarks)
run_experiment_instances(experiment_instances)

The results of the benchmarking procedure are stored locally and can be visualized.

agent_performances_per_mdp_plot(
    benchmark.get_experiments_benchmark_log_folder(),
    "cumulative_regret"
);
[Plot: per-MDP performances of QLearningAgent and QLearningEpisodic in terms of cumulative regret.]