colosseum.agent.mdp_models.bayesian_model
```python
from typing import TYPE_CHECKING, Tuple

import dm_env
import numpy as np

from colosseum.agent.mdp_models.base import BaseMDPModel
from colosseum.agent.mdp_models.bayesian_models import RewardsConjugateModel
from colosseum.agent.mdp_models.bayesian_models import TransitionsConjugateModel
from colosseum.utils.acme.specs import MDPSpec

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE


class BayesianMDPModel(BaseMDPModel):
    """
    The `BayesianMDPModel` is the wrapper for Bayesian tabular MDP models.
    """

    def __init__(
        self,
        seed: int,
        mdp_specs: MDPSpec,
        reward_prior_model: RewardsConjugateModel = None,
        transitions_prior_model: TransitionsConjugateModel = None,
        rewards_prior_prms=None,
        transitions_prior_prms=None,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        reward_prior_model : RewardsConjugateModel, optional
            The reward priors.
        transitions_prior_model : TransitionsConjugateModel, optional
            The transitions priors.
        rewards_prior_prms : Any
            The reward prior parameters.
        transitions_prior_prms : Any
            The transitions prior parameters.
        """
        super(BayesianMDPModel, self).__init__(seed, mdp_specs)

        if reward_prior_model is None:
            reward_prior_model = RewardsConjugateModel.N_NIG
            rewards_prior_prms = [self._reward_range[1], 1, 1, 1]
        if transitions_prior_model is None:
            transitions_prior_model = TransitionsConjugateModel.M_DIR
            transitions_prior_prms = [1.0 / self._n_states]

        self._rewards_model = reward_prior_model.get_class()(
            self._n_states, self._n_actions, rewards_prior_prms, seed
        )
        self._transitions_model = transitions_prior_model.get_class()(
            self._n_states, self._n_actions, transitions_prior_prms, seed
        )

    def sample(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        returns an MDP model in terms of transitions probabilities matrix and rewards matrix.
        """
        return self._transitions_model.sample(), self._rewards_model.sample()

    def sample_T(self):
        return self._transitions_model.sample()

    def sample_R(self):
        return self._rewards_model.sample()

    def get_map_estimate(self) -> Tuple[np.ndarray, np.ndarray]:
        return (
            self._transitions_model.get_map_estimate(),
            self._rewards_model.get_map_estimate(),
        )

    def step_update(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ):
        self._rewards_model.update_single_transition(
            ts_t.observation, a_t, ts_tp1.reward
        )
        if not ts_tp1.last():
            self._transitions_model.update_single_transition(
                ts_t.observation, a_t, ts_tp1.observation
            )
```
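The defaults correspond to standard conjugate pairs: a Multinomial-Dirichlet model (`M_DIR`) for the transitions and a Normal-Normal-Inverse-Gamma model (`N_NIG`) for the rewards. As a rough sketch of the mechanics behind the transitions default (illustrative only, not colosseum's internal implementation), a symmetric Dirichlet prior with concentration `1 / n_states` is updated by counting observed next states:

```python
import numpy as np

# Minimal sketch of the Multinomial-Dirichlet update that M_DIR performs
# conceptually; names and array layout here are illustrative assumptions.
n_states, n_actions = 5, 2
rng = np.random.default_rng(42)

# Symmetric Dirichlet prior: one concentration value per (s, a, s') entry,
# matching the default prior parameter [1.0 / n_states].
alpha = np.full((n_states, n_actions, n_states), 1.0 / n_states)

def update(s: int, a: int, s_next: int) -> None:
    """Conjugate update: observing s -[a]-> s_next increments one count."""
    alpha[s, a, s_next] += 1.0

def sample_T() -> np.ndarray:
    """Draw a transition kernel: one Dirichlet sample per (s, a) pair."""
    T = np.empty_like(alpha)
    for s in range(n_states):
        for a in range(n_actions):
            T[s, a] = rng.dirichlet(alpha[s, a])
    return T

update(0, 1, 3)
T = sample_T()
assert np.allclose(T.sum(-1), 1.0)  # each row is a valid distribution
```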
The `BayesianMDPModel` is the wrapper for Bayesian tabular MDP models: it maintains conjugate posteriors over the reward and transition distributions of a tabular MDP.
```python
BayesianMDPModel(
    seed: int,
    mdp_specs: MDPSpec,
    reward_prior_model: RewardsConjugateModel = None,
    transitions_prior_model: TransitionsConjugateModel = None,
    rewards_prior_prms=None,
    transitions_prior_prms=None,
)
```
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- reward_prior_model (RewardsConjugateModel, optional): The conjugate prior model for the rewards. Defaults to `RewardsConjugateModel.N_NIG`.
- transitions_prior_model (TransitionsConjugateModel, optional): The conjugate prior model for the transitions. Defaults to `TransitionsConjugateModel.M_DIR`.
- rewards_prior_prms (Any): The reward prior parameters. Only applied as a default when `reward_prior_model` is left unspecified, in which case it becomes `[max_reward, 1, 1, 1]`, where `max_reward` is the upper end of the reward range.
- transitions_prior_prms (Any): The transitions prior parameters. Only applied as a default when `transitions_prior_model` is left unspecified, in which case it becomes `[1.0 / n_states]`, i.e. a symmetric Dirichlet. Both defaults are spelled out in the construction sketch below.
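A minimal construction sketch. Here `mdp_specs` is assumed to be provided by the surrounding colosseum pipeline (building an `MDPSpec` is outside the scope of this module), and the concrete prior-parameter values are illustrative:

```python
from colosseum.agent.mdp_models.bayesian_model import BayesianMDPModel
from colosseum.agent.mdp_models.bayesian_models import (
    RewardsConjugateModel,
    TransitionsConjugateModel,
)

# `mdp_specs` is assumed to come from elsewhere in the pipeline.
model = BayesianMDPModel(seed=42, mdp_specs=mdp_specs)

# Spelling out the defaults explicitly (values mirror the source above):
model = BayesianMDPModel(
    seed=42,
    mdp_specs=mdp_specs,
    reward_prior_model=RewardsConjugateModel.N_NIG,
    rewards_prior_prms=[1.0, 1, 1, 1],  # default uses the reward range maximum first
    transitions_prior_model=TransitionsConjugateModel.M_DIR,
    transitions_prior_prms=[0.1],  # default is 1 / n_states; 0.1 assumes 10 states
)
```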
```python
def sample(self) -> Tuple[np.ndarray, np.ndarray]
```
Returns a sampled MDP model as a tuple of the transition probabilities matrix and the rewards matrix, drawn from the current posteriors. The companion methods `sample_T` and `sample_R` sample only the transitions or only the rewards, and `get_map_estimate` returns the MAP estimates of both.
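A sketch of how the sampling methods fit together, continuing from the `model` built in the construction sketch above; the array shapes in the comments are an assumption about the tabular layout, not documented here:

```python
T, R = model.sample()  # one posterior draw of the full MDP model

# Assumed tabular layout (verify against your setup):
#   T.shape == (n_states, n_actions, n_states)  -- T[s, a, s']
#   R.shape == (n_states, n_actions)            -- R[s, a]

T_only = model.sample_T()                # transitions draw only
R_only = model.sample_R()                # rewards draw only
T_map, R_map = model.get_map_estimate()  # MAP point estimates, no sampling
```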
```python
def step_update(
    self,
    ts_t: dm_env.TimeStep,
    a_t: "ACTION_TYPE",
    ts_tp1: dm_env.TimeStep,
    time: int,
)
```
Updates the posteriors with the transition in input. The rewards posterior is always updated, while the transitions posterior is updated only when `ts_tp1` is not a terminal step. A usage sketch follows the parameter list.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t (ACTION_TYPE): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- time (int): The current time of the environment. In the episodic setting, this is the in-episode time; in the continuous setting, it is the total number of previous interactions.
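A usage sketch with a generic dm_env interaction loop; `env` is assumed to be a `dm_env.Environment`, and the constant action stands in for whatever policy the agent follows:

```python
# `env` is a placeholder dm_env.Environment; the action choice is illustrative.
ts_t = env.reset()
t = 0
while not ts_t.last():
    a_t = 0                  # any valid action; an agent would choose here
    ts_tp1 = env.step(a_t)
    # The rewards posterior is always updated; the transitions update is
    # skipped on terminal steps, mirroring the `ts_tp1.last()` check in the source.
    model.step_update(ts_t, a_t, ts_tp1, time=t)
    ts_t = ts_tp1
    t += 1
```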