colosseum.agent.agents.episodic.posterior_sampling
```python
from typing import Any, Callable, Dict, Union

import dm_env
import gin
import numpy as np
from ray import tune

from colosseum.agent.actors import QValuesActor
from colosseum.agent.agents.base import BaseAgent
from colosseum.agent.mdp_models.bayesian_model import BayesianMDPModel
from colosseum.agent.mdp_models.bayesian_models import RewardsConjugateModel
from colosseum.agent.mdp_models.bayesian_models import TransitionsConjugateModel
from colosseum.dynamic_programming import episodic_value_iteration
from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.emission_maps import EmissionMap
from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class PSRLEpisodic(BaseAgent):
    """
    The posterior sampling for reinforcement learning algorithm.

    Osband, Ian, Daniel Russo, and Benjamin Van Roy. "(More) efficient reinforcement learning via posterior sampling."
    Advances in Neural Information Processing Systems 26 (2013).
    """

    def step_update(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ):
        super(PSRLEpisodic, self).step_update(ts_t, a_t, ts_tp1, time)

    @staticmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        return emission_map.is_tabular

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        return (
            "from colosseum.agent.mdp_models import bayesian_models\n"
            f"prms_{index}/PSRLEpisodic.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG\n"
            f"prms_{index}/PSRLEpisodic.transitions_prior_model = %bayesian_models.TransitionsConjugateModel.M_DIR\n"
            f"prms_{index}/PSRLEpisodic.rewards_prior_prms = [{parameters['prior_mean']}, 1, 1, 1]\n"
            f"prms_{index}/PSRLEpisodic.transitions_prior_prms = [{parameters['transition_prior']}]"
        )

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return dict(
            prior_mean=tune.uniform(0.001, 2.0), transition_prior=tune.uniform(0.001, 2)
        )

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: MDPSpec,
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return PSRLEpisodic(
            mdp_specs=mdp_specs,
            seed=seed,
            optimization_horizon=optimization_horizon,
            reward_prior_model=RewardsConjugateModel.N_NIG,
            transitions_prior_model=TransitionsConjugateModel.M_DIR,
            rewards_prior_prms=[parameters["prior_mean"], 1, 1, 1],
            transitions_prior_prms=[parameters["transition_prior"]],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        T_map, R_map = self._mdp_model.get_map_estimate()
        Q, _ = episodic_value_iteration(self._time_horizon, T_map, R_map)
        return get_policy_from_q_values(Q, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: MDPSpec,
        optimization_horizon: int,
        # MDP model parameters
        reward_prior_model: RewardsConjugateModel = None,
        transitions_prior_model: TransitionsConjugateModel = None,
        rewards_prior_prms=None,
        transitions_prior_prms=None,
        # Actor parameters
        epsilon_greedy: Union[float, Callable] = None,
        boltzmann_temperature: Union[float, Callable] = None,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        reward_prior_model : RewardsConjugateModel, optional
            The reward priors.
        transitions_prior_model : TransitionsConjugateModel, optional
            The transitions priors.
        rewards_prior_prms : Any
            The reward prior parameters.
        transitions_prior_prms : Any
            The transitions prior parameters.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of the
            total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of
            the total number of interactions. By default, Boltzmann exploration is disabled.
        """

        mdp_model = BayesianMDPModel(
            seed,
            mdp_specs,
            reward_prior_model=reward_prior_model,
            transitions_prior_model=transitions_prior_model,
            rewards_prior_prms=rewards_prior_prms,
            transitions_prior_prms=transitions_prior_prms,
        )
        actor = QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature)

        super(PSRLEpisodic, self).__init__(
            seed,
            mdp_specs,
            mdp_model,
            actor,
            optimization_horizon,
        )

    def episode_end_update(self):
        Q, _ = episodic_value_iteration(self._time_horizon, *self._mdp_model.sample())
        self._actor.set_q_values(Q)

    def before_start_interacting(self):
        self.episode_end_update()
```
```python
@gin.configurable
class PSRLEpisodic(BaseAgent)
```
The posterior sampling for reinforcement learning algorithm.
Osband, Ian, Daniel Russo, and Benjamin Van Roy. "(More) efficient reinforcement learning via posterior sampling." Advances in Neural Information Processing Systems 26 (2013).
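PSRL maintains a Bayesian posterior over the MDP, samples one MDP from that posterior at the start of each episode, and acts greedily with respect to the sampled MDP's optimal policy. The sketch below is a minimal, self-contained illustration of that idea in plain NumPy; it is not Colosseum's implementation (PSRLEpisodic delegates the posterior to BayesianMDPModel and the planning step to episodic_value_iteration), and the randomly generated MDP, the Dirichlet transition posterior, and the unit-variance Normal reward model are assumptions made only for this example.

```python
# Illustrative PSRL sketch on a small randomly generated tabular episodic MDP.
import numpy as np

rng = np.random.default_rng(0)
n_states, n_actions, horizon, n_episodes = 5, 2, 10, 200

# Ground-truth MDP, unknown to the agent.
true_P = rng.dirichlet(np.ones(n_states), size=(n_states, n_actions))
true_R = rng.uniform(size=(n_states, n_actions))

# Posterior statistics: Dirichlet counts for the transitions; Normal-Normal conjugate
# statistics for the mean rewards (prior N(0, 1), unit observation noise assumed).
dirichlet_counts = np.ones((n_states, n_actions, n_states))
reward_sums = np.zeros((n_states, n_actions))
reward_counts = np.zeros((n_states, n_actions))


def backward_induction(P, R):
    """Solve the sampled finite-horizon MDP; Q has shape (horizon, n_states, n_actions)."""
    Q = np.zeros((horizon, n_states, n_actions))
    V = np.zeros(n_states)
    for h in reversed(range(horizon)):
        Q[h] = R + P @ V
        V = Q[h].max(axis=1)
    return Q


for _ in range(n_episodes):
    # 1. Sample an MDP from the posterior.
    P_sample = np.apply_along_axis(rng.dirichlet, -1, dirichlet_counts)
    post_var = 1.0 / (reward_counts + 1.0)
    R_sample = post_var * reward_sums + np.sqrt(post_var) * rng.normal(size=post_var.shape)
    # 2. Solve the sampled MDP and act greedily with respect to it for one episode.
    Q = backward_induction(P_sample, R_sample)
    s = 0
    for h in range(horizon):
        a = int(Q[h, s].argmax())
        s_next = rng.choice(n_states, p=true_P[s, a])
        r = true_R[s, a] + rng.normal()
        # 3. Update the posterior with the observed transition and reward.
        dirichlet_counts[s, a, s_next] += 1
        reward_sums[s, a] += r
        reward_counts[s, a] += 1
        s = s_next
```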
```python
PSRLEpisodic(
    seed: int,
    mdp_specs: colosseum.utils.acme.specs.MDPSpec,
    optimization_horizon: int,
    reward_prior_model: colosseum.agent.mdp_models.bayesian_models.RewardsConjugateModel = None,
    transitions_prior_model: colosseum.agent.mdp_models.bayesian_models.TransitionsConjugateModel = None,
    rewards_prior_prms=None,
    transitions_prior_prms=None,
    epsilon_greedy: Union[float, Callable] = None,
    boltzmann_temperature: Union[float, Callable] = None,
)
```
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- reward_prior_model (RewardsConjugateModel, optional): The conjugate prior model for the rewards.
- transitions_prior_model (TransitionsConjugateModel, optional): The conjugate prior model for the transitions.
- rewards_prior_prms (Any): The parameters of the reward prior.
- transitions_prior_prms (Any): The parameters of the transitions prior.
- epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
- boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
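As a usage sketch, the constructor can be called directly with explicit conjugate priors. The prior values below mirror those used by produce_gin_file_from_parameters; how `mdp_specs` is obtained from a Colosseum MDP is assumed and not shown here.

```python
# Hedged sketch: direct instantiation of PSRLEpisodic. `mdp_specs` is assumed to be an
# MDPSpec built from the target Colosseum MDP (not shown here).
from colosseum.agent.agents.episodic.posterior_sampling import PSRLEpisodic
from colosseum.agent.mdp_models.bayesian_models import (
    RewardsConjugateModel,
    TransitionsConjugateModel,
)

agent = PSRLEpisodic(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=100_000,
    reward_prior_model=RewardsConjugateModel.N_NIG,            # conjugate rewards prior
    transitions_prior_model=TransitionsConjugateModel.M_DIR,   # conjugate transitions prior
    rewards_prior_prms=[0.5, 1, 1, 1],   # same layout as in produce_gin_file_from_parameters
    transitions_prior_prms=[1.0],
    epsilon_greedy=None,                 # exploration comes from posterior sampling alone
    boltzmann_temperature=None,
)
```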
```python
def step_update(
    self,
    ts_t: dm_env.TimeStep,
    a_t: "ACTION_TYPE",
    ts_tp1: dm_env.TimeStep,
    time: int,
):
    super(PSRLEpisodic, self).step_update(ts_t, a_t, ts_tp1, time)
```
Adds the input transition to the MDP model.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t ("ACTION_TYPE"): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
```python
@staticmethod
def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
    return emission_map.is_tabular
```
Returns
- bool: True if the agent class accepts the emission map.
```python
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
    return (
        "from colosseum.agent.mdp_models import bayesian_models\n"
        f"prms_{index}/PSRLEpisodic.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG\n"
        f"prms_{index}/PSRLEpisodic.transitions_prior_model = %bayesian_models.TransitionsConjugateModel.M_DIR\n"
        f"prms_{index}/PSRLEpisodic.rewards_prior_prms = [{parameters['prior_mean']}, 1, 1, 1]\n"
        f"prms_{index}/PSRLEpisodic.transitions_prior_prms = [{parameters['transition_prior']}]"
    )
```
Produces a string containing the gin configuration corresponding to the parameters given in input; see the example below.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration file.
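For example, with a hypothetical parameter dictionary (the values are arbitrary), the method produces the following gin bindings, which follow directly from the source shown above:

```python
print(
    PSRLEpisodic.produce_gin_file_from_parameters(
        {"prior_mean": 0.5, "transition_prior": 1.0}, index=0
    )
)
# from colosseum.agent.mdp_models import bayesian_models
# prms_0/PSRLEpisodic.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG
# prms_0/PSRLEpisodic.transitions_prior_model = %bayesian_models.TransitionsConjugateModel.M_DIR
# prms_0/PSRLEpisodic.rewards_prior_prms = [0.5, 1, 1, 1]
# prms_0/PSRLEpisodic.transitions_prior_prms = [1.0]
```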
```python
@staticmethod
def is_episodic() -> bool:
    return True
```
Returns
- bool: True if the agent is suited for the episodic setting.
```python
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
    return dict(
        prior_mean=tune.uniform(0.001, 2.0), transition_prior=tune.uniform(0.001, 2)
    )
```
Returns
- Dict[str, tune.sample.Domain]: The dictionary whose keys are hyperparameter names and whose values are the corresponding ray.tune samplers (see the sketch below).
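A small sketch of drawing one random configuration from these spaces by hand; it assumes Ray Tune Domain objects expose a sample() method, which is how a tuner would typically draw from them:

```python
# Hedged sketch: sample one hyperparameter configuration manually. Assumes ray.tune
# Domain objects provide a .sample() method.
spaces = PSRLEpisodic.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in spaces.items()}
# e.g. {'prior_mean': 1.37..., 'transition_prior': 0.42...}
```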
```python
@staticmethod
def get_agent_instance_from_parameters(
    seed: int,
    optimization_horizon: int,
    mdp_specs: MDPSpec,
    parameters: Dict[str, Any],
) -> "BaseAgent":
    return PSRLEpisodic(
        mdp_specs=mdp_specs,
        seed=seed,
        optimization_horizon=optimization_horizon,
        reward_prior_model=RewardsConjugateModel.N_NIG,
        transitions_prior_model=TransitionsConjugateModel.M_DIR,
        rewards_prior_prms=[parameters["prior_mean"], 1, 1, 1],
        transitions_prior_prms=[parameters["transition_prior"]],
    )
```
Returns an agent instance for the given MDP specification and agent parameters; see the sketch below.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
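A sketch of how this ties in with the search spaces above: take a parameter dictionary with the documented keys (drawn as in the earlier sketch, or fixed by hand within the documented ranges) and build the agent; obtaining `mdp_specs` is again assumed and not shown.

```python
# Hedged sketch: build the agent from a parameter dictionary whose keys match the
# hyperparameter search spaces. `mdp_specs` is assumed to be available (not shown).
parameters = {"prior_mean": 0.25, "transition_prior": 1.5}
agent = PSRLEpisodic.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_specs,
    parameters=parameters,
)
```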
```python
current_optimal_stochastic_policy: numpy.ndarray
```
Returns
- np.ndarray: The estimate of the optimal policy, given the agent's current knowledge, in the form of a distribution over actions.
```python
def episode_end_update(self):
    Q, _ = episodic_value_iteration(self._time_horizon, *self._mdp_model.sample())
    self._actor.set_q_values(Q)
```
Called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.