colosseum.agent.agents.episodic.posterior_sampling

from typing import Any, Callable, Dict, Union

import dm_env
import gin
import numpy as np
from ray import tune

from colosseum.agent.actors import QValuesActor
from colosseum.agent.agents.base import BaseAgent
from colosseum.agent.mdp_models.bayesian_model import BayesianMDPModel
from colosseum.agent.mdp_models.bayesian_models import RewardsConjugateModel
from colosseum.agent.mdp_models.bayesian_models import TransitionsConjugateModel
from colosseum.dynamic_programming import episodic_value_iteration
from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.emission_maps import EmissionMap
from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class PSRLEpisodic(BaseAgent):
    """
    The posterior sampling for reinforcement learning algorithm.

    Osband, Ian, Daniel Russo, and Benjamin Van Roy. "(More) efficient reinforcement learning via posterior sampling."
    Advances in Neural Information Processing Systems 26 (2013).
    """

    def step_update(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ):
        super(PSRLEpisodic, self).step_update(ts_t, a_t, ts_tp1, time)

    @staticmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        return emission_map.is_tabular

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        return (
            "from colosseum.agent.mdp_models import bayesian_models\n"
            f"prms_{index}/PSRLEpisodic.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG\n"
            f"prms_{index}/PSRLEpisodic.transitions_prior_model = %bayesian_models.TransitionsConjugateModel.M_DIR\n"
            f"prms_{index}/PSRLEpisodic.rewards_prior_prms = [{parameters['prior_mean']}, 1, 1, 1]\n"
            f"prms_{index}/PSRLEpisodic.transitions_prior_prms = [{parameters['transition_prior']}]"
        )

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return dict(
            prior_mean=tune.uniform(0.001, 2.0), transition_prior=tune.uniform(0.001, 2)
        )

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: MDPSpec,
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return PSRLEpisodic(
            mdp_specs=mdp_specs,
            seed=seed,
            optimization_horizon=optimization_horizon,
            reward_prior_model=RewardsConjugateModel.N_NIG,
            transitions_prior_model=TransitionsConjugateModel.M_DIR,
            rewards_prior_prms=[parameters["prior_mean"], 1, 1, 1],
            transitions_prior_prms=[parameters["transition_prior"]],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        T_map, R_map = self._mdp_model.get_map_estimate()
        Q, _ = episodic_value_iteration(self._time_horizon, T_map, R_map)
        return get_policy_from_q_values(Q, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: MDPSpec,
        optimization_horizon: int,
        # MDP model parameters
        reward_prior_model: RewardsConjugateModel = None,
        transitions_prior_model: TransitionsConjugateModel = None,
        rewards_prior_prms=None,
        transitions_prior_prms=None,
        # Actor parameters
        epsilon_greedy: Union[float, Callable] = None,
        boltzmann_temperature: Union[float, Callable] = None,
    ):
        """

        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        reward_prior_model : RewardsConjugateModel, optional
            The reward priors.
        transitions_prior_model : TransitionsConjugateModel, optional
            The transitions priors.
        rewards_prior_prms : Any
            The reward prior parameters.
        transitions_prior_prms : Any
            The transitions prior parameters.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of the
            total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of
            the total number of interactions. By default, Boltzmann exploration is disabled.
        """

        mdp_model = BayesianMDPModel(
            seed,
            mdp_specs,
            reward_prior_model=reward_prior_model,
            transitions_prior_model=transitions_prior_model,
            rewards_prior_prms=rewards_prior_prms,
            transitions_prior_prms=transitions_prior_prms,
        )
        actor = QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature)

        super(PSRLEpisodic, self).__init__(
            seed,
            mdp_specs,
            mdp_model,
            actor,
            optimization_horizon,
        )

    def episode_end_update(self):
        Q, _ = episodic_value_iteration(self._time_horizon, *self._mdp_model.sample())
        self._actor.set_q_values(Q)

    def before_start_interacting(self):
        self.episode_end_update()
@gin.configurable
class PSRLEpisodic(colosseum.agent.agents.base.BaseAgent):

The posterior sampling for reinforcement learning algorithm.

Osband, Ian, Daniel Russo, and Benjamin Van Roy. "(More) efficient reinforcement learning via posterior sampling." Advances in Neural Information Processing Systems 26 (2013).

PSRLEpisodic( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, reward_prior_model: colosseum.agent.mdp_models.bayesian_models.RewardsConjugateModel = None, transitions_prior_model: colosseum.agent.mdp_models.bayesian_models.TransitionsConjugateModel = None, rewards_prior_prms=None, transitions_prior_prms=None, epsilon_greedy: Union[float, Callable] = None, boltzmann_temperature: Union[float, Callable] = None)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • reward_prior_model (RewardsConjugateModel, optional): The reward priors.
  • transitions_prior_model (TransitionsConjugateModel, optional): The transitions priors.
  • rewards_prior_prms (Any): The reward prior parameters.
  • transitions_prior_prms (Any): The transitions prior parameters.
  • epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
  • boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
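
A minimal construction sketch, assuming a tabular episodic MDPSpec is already available in a variable named mdp_specs (how it is obtained is outside the scope of this page):

# Construction sketch; `mdp_specs` is assumed to be an MDPSpec for a tabular episodic MDP.
from colosseum.agent.agents.episodic.posterior_sampling import PSRLEpisodic
from colosseum.agent.mdp_models.bayesian_models import (
    RewardsConjugateModel,
    TransitionsConjugateModel,
)

agent = PSRLEpisodic(
    seed=42,
    mdp_specs=mdp_specs,  # assumed to be available
    optimization_horizon=100_000,
    reward_prior_model=RewardsConjugateModel.N_NIG,
    rewards_prior_prms=[0.5, 1, 1, 1],  # first entry is the prior mean, as in produce_gin_file_from_parameters
    transitions_prior_model=TransitionsConjugateModel.M_DIR,
    transitions_prior_prms=[1.0],
)
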
def step_update( self, ts_t: dm_env._environment.TimeStep, a_t: 'ACTION_TYPE', ts_tp1: dm_env._environment.TimeStep, time: int):

Adds the given transition to the MDP model.

Parameters
  • ts_t (dm_env.TimeStep): The TimeStep at time t.
  • a_t ("ACTION_TYPE"): The action taken by the agent at time t.
  • ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
  • time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
@staticmethod
def is_emission_map_accepted(emission_map: colosseum.emission_maps.base.EmissionMap) -> bool:
Returns
  • bool: True if the agent class accepts the emission map.
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces a string containing the gin configuration file corresponding to the given parameters.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration file.
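
For example, for a configuration whose keys match the agent's hyperparameter search space, the method produces the following gin bindings (the output below is reproduced from the method's code):

gin_str = PSRLEpisodic.produce_gin_file_from_parameters(
    dict(prior_mean=0.5, transition_prior=1.0), index=0
)
print(gin_str)
# from colosseum.agent.mdp_models import bayesian_models
# prms_0/PSRLEpisodic.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG
# prms_0/PSRLEpisodic.transitions_prior_model = %bayesian_models.TransitionsConjugateModel.M_DIR
# prms_0/PSRLEpisodic.rewards_prior_prms = [0.5, 1, 1, 1]
# prms_0/PSRLEpisodic.transitions_prior_prms = [1.0]
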
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: A dictionary mapping hyperparameter names to the corresponding ray.tune samplers.
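
A short sketch of drawing a random configuration from these domains (this assumes the ray.tune Domain objects expose a sample() method):

# Draw one value per hyperparameter, e.g. {'prior_mean': 1.23, 'transition_prior': 0.45}.
search_spaces = PSRLEpisodic.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in search_spaces.items()}
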
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the given MDP specification and agent parameters.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
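
Putting the two helpers together, an agent can be built from such a configuration; as in the earlier sketch, mdp_specs is assumed to be available:

parameters = dict(prior_mean=0.5, transition_prior=1.0)  # e.g. sampled from the search space above
agent = PSRLEpisodic.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=100_000,
    mdp_specs=mdp_specs,  # assumed to be available
    parameters=parameters,
)
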
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The estimate of the optimal policy given the agent's current knowledge, expressed as a distribution over actions.
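
A quick sanity check on the returned array (a sketch that assumes the last axis indexes actions):

import numpy as np

policy = agent.current_optimal_stochastic_policy
# Every action distribution should sum to one.
assert np.allclose(policy.sum(axis=-1), 1.0)
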
def episode_end_update(self):

Called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.
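
This per-episode step is the core of posterior sampling: draw one MDP from the current posterior, plan on it, and act greedily with respect to its Q-values for the whole episode. The following is a generic, self-contained illustration of that loop for a tabular MDP with Bernoulli rewards; it is not Colosseum's implementation (PSRLEpisodic delegates these steps to BayesianMDPModel, episodic_value_iteration, and QValuesActor), and the true model true_T, true_R used to simulate interaction is a stand-in for the environment.

import numpy as np

def run_psrl(true_T, true_R, horizon, n_episodes, seed=0):
    """Generic tabular PSRL sketch: Dirichlet prior on transitions, Beta prior on Bernoulli rewards.

    true_T has shape (S, A, S) with rows summing to one; true_R has shape (S, A) with values in [0, 1].
    """
    rng = np.random.default_rng(seed)
    n_states, n_actions, _ = true_T.shape
    dir_counts = np.ones((n_states, n_actions, n_states))  # Dirichlet posterior counts
    beta_a = np.ones((n_states, n_actions))                 # Beta posterior successes
    beta_b = np.ones((n_states, n_actions))                 # Beta posterior failures

    for _ in range(n_episodes):
        # 1. Sample an MDP from the posterior (what episode_end_update does via self._mdp_model.sample()).
        T = np.array([[rng.dirichlet(dir_counts[s, a]) for a in range(n_actions)] for s in range(n_states)])
        R = rng.beta(beta_a, beta_b)
        # 2. Plan on the sampled MDP by backward induction (episodic value iteration).
        Q = np.zeros((horizon, n_states, n_actions))
        V = np.zeros(n_states)
        for h in reversed(range(horizon)):
            Q[h] = R + T @ V
            V = Q[h].max(-1)
        # 3. Act greedily w.r.t. the sampled MDP and update the posterior with the observed transitions.
        s = 0
        for h in range(horizon):
            a = int(Q[h, s].argmax())
            r = rng.binomial(1, true_R[s, a])
            s_next = rng.choice(n_states, p=true_T[s, a])
            beta_a[s, a] += r
            beta_b[s, a] += 1 - r
            dir_counts[s, a, s_next] += 1
            s = s_next
    return dir_counts, beta_a, beta_b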

def before_start_interacting(self):

Called before the agent starts interacting with the MDP.