colosseum.agent.mdp_models.bayesian_model

from typing import TYPE_CHECKING, Tuple

import dm_env
import numpy as np

from colosseum.agent.mdp_models.base import BaseMDPModel
from colosseum.agent.mdp_models.bayesian_models import RewardsConjugateModel
from colosseum.agent.mdp_models.bayesian_models import TransitionsConjugateModel
from colosseum.utils.acme.specs import MDPSpec

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE


class BayesianMDPModel(BaseMDPModel):
    """
    The `BayesianMDPModel` is the wrapper for Bayesian tabular MDP models.
    """

    def __init__(
        self,
        seed: int,
        mdp_specs: MDPSpec,
        reward_prior_model: RewardsConjugateModel = None,
        transitions_prior_model: TransitionsConjugateModel = None,
        rewards_prior_prms=None,
        transitions_prior_prms=None,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        reward_prior_model : RewardsConjugateModel, optional
            The reward priors.
        transitions_prior_model : TransitionsConjugateModel, optional
            The transitions priors.
        rewards_prior_prms : Any
            The reward prior parameters.
        transitions_prior_prms : Any
            The transitions prior parameters.
        """

        super(BayesianMDPModel, self).__init__(seed, mdp_specs)

        # Default conjugate priors: Normal-Inverse-Gamma for the rewards and
        # Dirichlet for the transitions.
        if reward_prior_model is None:
            reward_prior_model = RewardsConjugateModel.N_NIG
            rewards_prior_prms = [self._reward_range[1], 1, 1, 1]
        if transitions_prior_model is None:
            transitions_prior_model = TransitionsConjugateModel.M_DIR
            transitions_prior_prms = [1.0 / self._n_states]

        self._rewards_model = reward_prior_model.get_class()(
            self._n_states, self._n_actions, rewards_prior_prms, seed
        )
        self._transitions_model = transitions_prior_model.get_class()(
            self._n_states, self._n_actions, transitions_prior_prms, seed
        )

    def sample(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Returns an MDP model as a tuple of the transition probabilities matrix and the rewards matrix.
        """
        return self._transitions_model.sample(), self._rewards_model.sample()

    def sample_T(self):
        return self._transitions_model.sample()

    def sample_R(self):
        return self._rewards_model.sample()

    def get_map_estimate(self) -> Tuple[np.ndarray, np.ndarray]:
        return (
            self._transitions_model.get_map_estimate(),
            self._rewards_model.get_map_estimate(),
        )

    def step_update(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ):
        self._rewards_model.update_single_transition(
            ts_t.observation, a_t, ts_tp1.reward
        )
        # The transition posterior is only updated if the episode has not ended.
        if not ts_tp1.last():
            self._transitions_model.update_single_transition(
                ts_t.observation, a_t, ts_tp1.observation
            )
class BayesianMDPModel(colosseum.agent.mdp_models.base.BaseMDPModel):

The BayesianMDPModel is the wrapper for Bayesian tabular MDP models.

BayesianMDPModel( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, reward_prior_model: colosseum.agent.mdp_models.bayesian_models.RewardsConjugateModel = None, transitions_prior_model: colosseum.agent.mdp_models.bayesian_models.TransitionsConjugateModel = None, rewards_prior_prms=None, transitions_prior_prms=None)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • reward_prior_model (RewardsConjugateModel, optional): The reward priors.
  • transitions_prior_model (TransitionsConjugateModel, optional): The transitions priors.
  • rewards_prior_prms (Any): The reward prior parameters.
  • transitions_prior_prms (Any): The transitions prior parameters.
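
A minimal construction sketch under stated assumptions: an MDPSpec instance named mdp_specs is assumed to be available from the surrounding agent or benchmark code (this page does not show how it is built), and the explicit prior parameters simply mirror the defaults that __init__ would pick for a hypothetical 10-state MDP with rewards in [0, 1].

from colosseum.agent.mdp_models.bayesian_model import BayesianMDPModel
from colosseum.agent.mdp_models.bayesian_models import (
    RewardsConjugateModel,
    TransitionsConjugateModel,
)

# mdp_specs: MDPSpec is assumed to be provided by the surrounding code.
model = BayesianMDPModel(seed=42, mdp_specs=mdp_specs)  # default N_NIG / M_DIR priors

# Explicit construction mirroring the defaults for the assumed 10-state MDP.
model = BayesianMDPModel(
    seed=42,
    mdp_specs=mdp_specs,
    reward_prior_model=RewardsConjugateModel.N_NIG,
    rewards_prior_prms=[1.0, 1, 1, 1],       # [max reward, 1, 1, 1]
    transitions_prior_model=TransitionsConjugateModel.M_DIR,
    transitions_prior_prms=[1.0 / 10],       # [1 / n_states]
)
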
def sample(self) -> Tuple[numpy.ndarray, numpy.ndarray]:

Returns an MDP model as a tuple of the transition probabilities matrix and the rewards matrix.

def sample_T(self):
Samples a transition probabilities matrix.

def sample_R(self):
Samples a rewards matrix.

def get_map_estimate(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
Returns the maximum a posteriori (MAP) estimates of the transition probabilities matrix and the rewards matrix.

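A short, hedged usage note continuing the construction sketch above: sample() draws one plausible MDP from the current posterior, whereas get_map_estimate() returns point estimates; the exact array shapes are not stated on this page.

T, R = model.sample()                    # one posterior draw of (transitions, rewards)
T_map, R_map = model.get_map_estimate()  # maximum a posteriori point estimates
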
def step_update( self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, time: int):

Updates the model with the given transition.

Parameters
  • ts_t (dm_env.TimeStep): The TimeStep at time t.
  • a_t (ACTION_TYPE): The action taken by the agent at time t.
  • ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
  • time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
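
A hedged sketch of how an agent loop might feed a single transition into step_update. It assumes tabular observations are integer state indices (not confirmed on this page) and reuses the model built in the construction sketch above; dm_env.restart and dm_env.transition are the standard dm_env helpers for creating TimeStep objects.

import dm_env

ts_t = dm_env.restart(observation=0)                   # episode starts in state 0
a_t = 1                                                # action chosen by the agent
ts_tp1 = dm_env.transition(reward=0.5, observation=3)  # next state 3 with reward 0.5

# Updates the reward posterior for (s=0, a=1) and, because ts_tp1 is not the
# last step of the episode, the transition posterior for (s=0, a=1, s'=3).
model.step_update(ts_t, a_t, ts_tp1, time=0)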