colosseum.agent.agents.infinite_horizon.posterior_sampling

from typing import TYPE_CHECKING, Any, Callable, Dict, Union

import dm_env
import gin
import numpy as np
from ray import tune

from colosseum.agent.actors import QValuesActor
from colosseum.agent.agents.base import BaseAgent
from colosseum.agent.mdp_models.bayesian_model import BayesianMDPModel
from colosseum.agent.mdp_models.bayesian_models import RewardsConjugateModel
from colosseum.agent.mdp_models.bayesian_models import TransitionsConjugateModel
from colosseum.dynamic_programming import discounted_value_iteration
from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.emission_maps import EmissionMap
from colosseum.utils.acme.specs import MDPSpec

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE


def get_psi(n_states, n_actions, T, p) -> float:
    r"""
    computes the theoretical value of the :math:`\psi` parameter.

    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    p : float
        The probability of failure.

    Returns
    -------
    float
        The theoretical value of the :math:`\psi` parameter.
    """
    return n_states * np.log(n_states * n_actions / p)


def get_omega(n_states, n_actions, T, p) -> float:
    r"""
    computes the theoretical value of the :math:`\omega` parameter.

    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    p : float
        The probability of failure.

    Returns
    -------
    float
        The theoretical value of the :math:`\omega` parameter.
    """
    return np.log(T / p)


def get_kappa(n_states, n_actions, T, p) -> float:
    r"""
    computes the theoretical value of the :math:`\kappa` parameter.

    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    p : float
        The probability of failure.

    Returns
    -------
    float
        The theoretical value of the :math:`\kappa` parameter.
    """
    return np.log(T / p)


def get_eta(n_states, n_actions, T, p, omega) -> float:
    r"""
    computes the theoretical value of the :math:`\eta` parameter.

    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    p : float
        The probability of failure.
    omega : float
        The omega parameter.

    Returns
    -------
    float
        The theoretical value of the :math:`\eta` parameter.
    """
    return np.sqrt(T * n_states / n_actions) + 12 * omega * n_states ** 4


@gin.configurable
class PSRLContinuous(BaseAgent):
    """
    The posterior sampling for reinforcement learning algorithm with optimism.

    Agrawal, Shipra, and Randy Jia. "Posterior sampling for reinforcement learning: worst-case regret bounds." arXiv
    preprint arXiv:1705.07041 (2017).
    """

    @staticmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        return emission_map.is_tabular

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        return (
            "from colosseum.agent.mdp_models import bayesian_models\n"
            f"prms_{index}/PSRLContinuous.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG\n"
            f"prms_{index}/PSRLContinuous.rewards_prior_prms = [{parameters['rewards_prior_mean']}, 1, 1, 1]\n"
            f"prms_{index}/PSRLContinuous.psi_weight = {parameters['psi_weight']}\n"
            f"prms_{index}/PSRLContinuous.omega_weight = {parameters['omega_weight']}\n"
            f"prms_{index}/PSRLContinuous.kappa_weight = {parameters['kappa_weight']}\n"
            f"prms_{index}/PSRLContinuous.eta_weight = {parameters['eta_weight']}"
        )

    @staticmethod
    def is_episodic() -> bool:
        return False

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "psi_weight": tune.uniform(0.001, 0.1),
            "omega_weight": tune.uniform(0.0001, 1),
            "kappa_weight": tune.uniform(0.2, 4),
            "eta_weight": tune.uniform(1e-10, 1e-6),
            "rewards_prior_mean": tune.uniform(0.0, 1.2),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: MDPSpec,
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return PSRLContinuous(
            mdp_specs=mdp_specs,
            seed=seed,
            optimization_horizon=optimization_horizon,
            reward_prior_model=RewardsConjugateModel.N_NIG,
            rewards_prior_prms=[parameters["rewards_prior_mean"], 1, 1, 1],
            psi_weight=parameters["psi_weight"],
            omega_weight=parameters["omega_weight"],
            kappa_weight=parameters["kappa_weight"],
            eta_weight=parameters["eta_weight"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        T_map, R_map = self._mdp_model.get_map_estimate()
        Q, _ = discounted_value_iteration(T_map, R_map)
        return get_policy_from_q_values(Q, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: MDPSpec,
        optimization_horizon: int,
        # MDP model parameters
        reward_prior_model: RewardsConjugateModel = None,
        transitions_prior_model: TransitionsConjugateModel = None,
        rewards_prior_prms=None,
        transitions_prior_prms=None,
        # Actor parameters
        epsilon_greedy: Union[float, Callable] = None,
        boltzmann_temperature: Union[float, Callable] = None,
        psi_weight: float = 1.0,
        omega_weight: float = 1.0,
        kappa_weight: float = 1.0,
        eta_weight: float = 1.0,
        get_psi: Callable[[int, int, int, float], float] = get_psi,
        get_omega: Callable[[int, int, int, float], float] = get_omega,
        get_kappa: Callable[[int, int, int, float], float] = get_kappa,
        get_eta: Callable[[int, int, int, float, float], float] = get_eta,
        p: float = 0.05,
        no_optimistic_sampling: bool = False,
        truncate_reward_with_max: bool = False,
        min_steps_before_new_episode: int = 0,
        max_psi: int = 60,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        reward_prior_model : RewardsConjugateModel, optional
            The reward priors.
        transitions_prior_model : TransitionsConjugateModel, optional
            The transitions priors.
        rewards_prior_prms : Any
            The reward prior parameters.
        transitions_prior_prms : Any
            The transitions prior parameters.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of the
            total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of
            the total number of interactions. By default, Boltzmann exploration is disabled.
        psi_weight : float
            The coefficient for which the theoretical value of the :math:`\psi` parameter is multiplied for. By default,
            it is set to one.
        omega_weight : float
            The coefficient for which the theoretical value of the :math:`\omega` parameter is multiplied for. By default,
            it is set to one.
        kappa_weight : float
            The coefficient for which the theoretical value of the :math:`\kappa` parameter is multiplied for. By default,
            it is set to one.
        eta_weight : float
            The coefficient for which the theoretical value of the :math:`\eta` parameter is multiplied for. By default,
            it is set to one.
        get_psi : Callable[[int, int, int, float], float]
            The function that computes the value of the :math:`\psi` parameter given number of states, number of action,
             optimization horizon, and probability of failure. By default, it is set to the theoretical value.
        get_omega : Callable[[int, int, int, float], float]
            The function that computes the value of the :math:`\omega` parameter given number of states, number of action,
             optimization horizon, and probability of failure. By default, it is set to the theoretical value.
        get_kappa : Callable[[int, int, int, float], float]
            The function that computes the value of the :math:`\kappa` parameter given number of states, number of action,
             optimization horizon, and probability of failure. By default, it is set to the theoretical value.
        get_eta : Callable[[int, int, int, float, float], float]
            The function that computes the value of the :math:`\eta` parameter given number of states, number of action,
             optimization horizon, probability of failure, and the omega parameter. By default, it is set to the
             theoretical value.
        p : float
            The probability of failure. By default, it is set to :math:`0.05`.
        no_optimistic_sampling : bool
            If True the optimistic sampling procedure is disabled.
        truncate_reward_with_max : bool
            If True, the sampled rewards are truncated to the maximum possible value of the reward. By default, it is
            set to False.
        min_steps_before_new_episode : int
            The minimum interval length between artificial episodes. By default, it is set to zero.
        max_psi : int
            The maximum value of the :math:`\psi` parameter. By default, it is set to :math:`60`.
        """

        self._n_states = mdp_specs.observations.num_values
        self._n_actions = mdp_specs.actions.num_values

        self.truncate_reward_with_max = truncate_reward_with_max
        self.no_optimistic_sampling = (
            no_optimistic_sampling
            or (self._n_states ** 2 * self._n_actions) > 6_000_000
        )

        self.p = p
        self.psi = min(
            max_psi,
            max(
                2,
                int(
                    psi_weight
                    * get_psi(self._n_states, self._n_actions, optimization_horizon, p)
                ),
            ),
        )
        self.omega = omega_weight * get_omega(
            self._n_states, self._n_actions, optimization_horizon, p
        )
        self.kappa = kappa_weight * get_kappa(
            self._n_states, self._n_actions, optimization_horizon, p
        )
        self.eta = max(
            5,
            min(
                10 * self._n_states,
                eta_weight
                * get_eta(
                    self._n_states,
                    self._n_actions,
                    optimization_horizon,
                    p,
                    self.omega,
                ),
            ),
        )

        self._n_states = mdp_specs.observations.num_values
        self.episode = 0
        self.min_steps_before_new_episode = min_steps_before_new_episode
        self.last_change = 0

        self.M = np.zeros(
            (self._n_states, self._n_actions, self._n_states), dtype=np.float32
        )
        self.N = np.zeros(
            (self._n_states, self._n_actions, self._n_states), dtype=np.int32
        )
        q_shape = (
            (self._n_states, self._n_actions, self._n_states)
            if no_optimistic_sampling
            else (self.psi, self._n_states, self._n_actions, self._n_states)
        )
        self.Q = np.zeros(q_shape, dtype=np.float32)
        self.nu_k = np.zeros((self._n_states, self._n_actions), dtype=np.int8)
        self.episode_transition_data = dict()

        mdp_model = BayesianMDPModel(
            seed,
            mdp_specs,
            reward_prior_model=reward_prior_model,
            transitions_prior_model=transitions_prior_model,
            rewards_prior_prms=rewards_prior_prms,
            transitions_prior_prms=transitions_prior_prms,
        )

        super(PSRLContinuous, self).__init__(
            seed,
            mdp_specs,
            mdp_model,
            QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature),
            optimization_horizon,
        )

    def is_episode_end(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ) -> bool:
        if time - self.last_change < self.min_steps_before_new_episode:
            return False
        self.last_change = time
        nu_k = len(self.episode_transition_data[ts_t.observation, a_t])
        N_tau = self.N[ts_t.observation, a_t].sum()
        return N_tau >= 2 * (N_tau - nu_k)

    def episode_end_update(self):
        if self.no_optimistic_sampling:
            T = self._mdp_model.sample_T()
        else:
            self.optimistic_sampling()
            T = np.moveaxis(self.Q, 0, 2)
            T = T.reshape((self._n_states, -1, self._n_states))

        R = self._mdp_model.sample_R()
        if self.truncate_reward_with_max:
            R = np.maximum(self.r_max, R)
        if not self.no_optimistic_sampling:
            R = np.tile(R, (1, self.psi))

        Q, _ = discounted_value_iteration(T, R)
        self._actor.set_q_values(Q)

        self.episode_transition_data = dict()

    def before_start_interacting(self):
        self._actor.set_q_values(
            self._rng.randn(self._n_states, self._n_actions * self.psi)
        )
        self.episode_end_update()

    def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE":
        return self.extended_action_to_real(
            super(PSRLContinuous, self).select_action(ts, time)
        )

    def step_update(
        self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, h: int
    ):
        super(PSRLContinuous, self).step_update(ts_t, a_t, ts_tp1, h)

        self.M[ts_t.observation, a_t, ts_tp1.observation] = (
            self.N[ts_t.observation, a_t, ts_tp1.observation] + self.omega
        ) / self.kappa
        self.N[ts_t.observation, a_t, ts_tp1.observation] += 1

        if (ts_t.observation, a_t) in self.episode_transition_data:
            if not ts_tp1.last():
                self.episode_transition_data[ts_t.observation, a_t].append(
                    ts_tp1.observation
                )
        else:
            if not ts_tp1.last():
                self.episode_transition_data[ts_t.observation, a_t] = [
                    ts_tp1.observation
                ]

    def optimistic_sampling(self):
        """
        performs the optimistic sampling procedure.
        """
        Nsum = self.N.sum(-1)
        cond = Nsum < self.eta
        indices_2 = list(np.where(cond))
        indices_1 = list(np.where(~cond))

        do_simple_sampling = len(indices_2[0]) > 0
        do_posterior_sampling = len(indices_1[0]) > 0
        if do_simple_sampling:
            P_hat = self.N / np.maximum(Nsum[..., None], 1)
            N = np.maximum(self.N, 1)
            P_minus = P_hat - np.minimum(
                np.sqrt(3 * P_hat * np.log(4 * self._n_states) / N)
                + 3 * np.log(4 * self._n_states) / N,
                P_hat,
            )

        for psi in range(self.psi):
            if do_posterior_sampling:
                self.Q[
                    tuple([np.array([psi] * len(indices_1[0]))] + indices_1)
                ] = self._mdp_model._transitions_model.sample_sa(tuple(indices_1))
            if do_simple_sampling:
                z = self._rng.randint(self._n_states)
                summing = 1 - P_minus.sum(-1)
                P_minus[:, :, z] += summing
                self.Q[
                    tuple([np.array([psi] * len(indices_2[0]))] + indices_2)
                ] = P_minus[tuple(indices_2)]
                P_minus[:, :, z] -= summing

    def extended_action_to_real(self, action) -> int:
        """
        transform the extended action used to induce optimistic to a real action of the MDP.
        """
        if self.no_optimistic_sampling:
            return action
        psi, real_action = action % self.psi, int(action / self.psi)
        return real_action
def get_psi(n_states, n_actions, T, p) -> float:

Computes the theoretical value of the \( \psi \) parameter.

Parameters
  • n_states (int): The number of states.
  • n_actions (int): The number of actions.
  • T (int): The optimization horizon.
  • p (float): The probability of failure.
Returns
  • float: The theoretical value of the \( \psi \) parameter.
def get_omega(n_states, n_actions, T, p) -> float:

Computes the theoretical value of the \( \omega \) parameter.

Parameters
  • n_states (int): The number of states.
  • n_actions (int): The number of actions.
  • T (int): The optimization horizon.
  • p (float): The probability of failure.
Returns
  • float: The theoretical value of the \( \omega \) parameter.
def get_kappa(n_states, n_actions, T, p) -> float:

Computes the theoretical value of the \( \kappa \) parameter.

Parameters
  • n_states (int): The number of states.
  • n_actions (int): The number of actions.
  • T (int): The optimization horizon.
  • p (float): The probability of failure.
Returns
  • float: The theoretical value of the \( \kappa \) parameter.
def get_eta(n_states, n_actions, T, p, omega) -> float:

Computes the theoretical value of the \( \eta \) parameter.

Parameters
  • n_states (int): The number of states.
  • n_actions (int): The number of actions.
  • T (int): The optimization horizon.
  • p (float): The probability of failure.
  • omega (float): The omega parameter.
Returns
  • float: The theoretical value of the \( \eta \) parameter.
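
The following sketch shows how these helper functions can be evaluated directly; the numbers of states and actions, the horizon, and the failure probability below are made-up values used only for illustration. Note that, inside PSRLContinuous.__init__, the resulting \( \psi \) is additionally clipped to the range [2, max_psi] and \( \eta \) to the range [5, 10 * n_states].

from colosseum.agent.agents.infinite_horizon.posterior_sampling import (
    get_eta,
    get_kappa,
    get_omega,
    get_psi,
)

# Made-up problem sizes, for illustration only.
n_states, n_actions = 10, 3
T = 100_000  # optimization horizon
p = 0.05     # probability of failure

psi = get_psi(n_states, n_actions, T, p)         # n_states * log(n_states * n_actions / p)
omega = get_omega(n_states, n_actions, T, p)     # log(T / p)
kappa = get_kappa(n_states, n_actions, T, p)     # log(T / p)
eta = get_eta(n_states, n_actions, T, p, omega)  # sqrt(T * n_states / n_actions) + 12 * omega * n_states ** 4

print(psi, omega, kappa, eta)
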
@gin.configurable
class PSRLContinuous(colosseum.agent.agents.base.BaseAgent):

The posterior sampling for reinforcement learning algorithm with optimism.

Agrawal, Shipra, and Randy Jia. "Posterior sampling for reinforcement learning: worst-case regret bounds." arXiv preprint arXiv:1705.07041 (2017).

PSRLContinuous(
    seed: int,
    mdp_specs: colosseum.utils.acme.specs.MDPSpec,
    optimization_horizon: int,
    reward_prior_model: colosseum.agent.mdp_models.bayesian_models.RewardsConjugateModel = None,
    transitions_prior_model: colosseum.agent.mdp_models.bayesian_models.TransitionsConjugateModel = None,
    rewards_prior_prms=None,
    transitions_prior_prms=None,
    epsilon_greedy: Union[float, Callable] = None,
    boltzmann_temperature: Union[float, Callable] = None,
    psi_weight: float = 1.0,
    omega_weight: float = 1.0,
    kappa_weight: float = 1.0,
    eta_weight: float = 1.0,
    get_psi: Callable[[int, int, int, float], float] = <function get_psi>,
    get_omega: Callable[[int, int, int, float], float] = <function get_omega>,
    get_kappa: Callable[[int, int, int, float], float] = <function get_kappa>,
    get_eta: Callable[[int, int, int, float, float], float] = <function get_eta>,
    p: float = 0.05,
    no_optimistic_sampling: bool = False,
    truncate_reward_with_max: bool = False,
    min_steps_before_new_episode: int = 0,
    max_psi: int = 60,
)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • reward_prior_model (RewardsConjugateModel, optional): The reward priors.
  • transitions_prior_model (TransitionsConjugateModel, optional): The transitions priors.
  • rewards_prior_prms (Any): The reward prior parameters.
  • transitions_prior_prms (Any): The transitions prior parameters.
  • epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
  • boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
  • psi_weight (float): The coefficient by which the theoretical value of the \( \psi \) parameter is multiplied. By default, it is set to one.
  • omega_weight (float): The coefficient by which the theoretical value of the \( \omega \) parameter is multiplied. By default, it is set to one.
  • kappa_weight (float): The coefficient by which the theoretical value of the \( \kappa \) parameter is multiplied. By default, it is set to one.
  • eta_weight (float): The coefficient by which the theoretical value of the \( \eta \) parameter is multiplied. By default, it is set to one.
  • get_psi (Callable[[int, int, int, float], float]): The function that computes the value of the \( \psi \) parameter given the number of states, the number of actions, the optimization horizon, and the probability of failure. By default, it is set to the theoretical value.
  • get_omega (Callable[[int, int, int, float], float]): The function that computes the value of the \( \omega \) parameter given the number of states, the number of actions, the optimization horizon, and the probability of failure. By default, it is set to the theoretical value.
  • get_kappa (Callable[[int, int, int, float], float]): The function that computes the value of the \( \kappa \) parameter given the number of states, the number of actions, the optimization horizon, and the probability of failure. By default, it is set to the theoretical value.
  • get_eta (Callable[[int, int, int, float, float], float]): The function that computes the value of the \( \eta \) parameter given the number of states, the number of actions, the optimization horizon, the probability of failure, and the omega parameter. By default, it is set to the theoretical value.
  • p (float): The probability of failure. By default, it is set to \( 0.05 \).
  • no_optimistic_sampling (bool): If True, the optimistic sampling procedure is disabled.
  • truncate_reward_with_max (bool): If True, the sampled rewards are truncated to the maximum possible value of the reward. By default, it is set to False.
  • min_steps_before_new_episode (int): The minimum interval length between artificial episodes. By default, it is set to zero.
  • max_psi (int): The maximum value of the \( \psi \) parameter. By default, it is set to \( 60 \).
@staticmethod
def is_emission_map_accepted(emission_map: colosseum.emission_maps.base.EmissionMap) -> bool:
Returns
  • bool: True if the agent class accepts the emission map.
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces a string containing the gin configuration corresponding to the parameters given in input.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration file.
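
As an illustrative sketch (the hyperparameter values below are made up), the produced gin bindings look as follows.

from colosseum.agent.agents.infinite_horizon.posterior_sampling import PSRLContinuous

# Made-up hyperparameter values, for illustration only.
parameters = dict(
    rewards_prior_mean=0.5,
    psi_weight=0.01,
    omega_weight=0.5,
    kappa_weight=1.0,
    eta_weight=1e-08,
)

print(PSRLContinuous.produce_gin_file_from_parameters(parameters, index=0))
# from colosseum.agent.mdp_models import bayesian_models
# prms_0/PSRLContinuous.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG
# prms_0/PSRLContinuous.rewards_prior_prms = [0.5, 1, 1, 1]
# prms_0/PSRLContinuous.psi_weight = 0.01
# prms_0/PSRLContinuous.omega_weight = 0.5
# prms_0/PSRLContinuous.kappa_weight = 1.0
# prms_0/PSRLContinuous.eta_weight = 1e-08
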
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding ray.tune samplers.
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the MDP specification and agent parameters given in input.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
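
A minimal usage sketch is given below; it assumes that an MDPSpec for a tabular MDP has already been obtained elsewhere (the mdp_specs placeholder is not constructed here) and that the ray.tune domains returned by get_hyperparameters_search_spaces expose a sample() method.

from colosseum.agent.agents.infinite_horizon.posterior_sampling import PSRLContinuous

# Sample one hyperparameter configuration from the ray.tune search spaces.
search_spaces = PSRLContinuous.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in search_spaces.items()}

mdp_specs = ...  # an MDPSpec for a tabular MDP, obtained from a Colosseum environment (not shown here)

agent = PSRLContinuous.get_agent_instance_from_parameters(
    seed=42,
    optimization_horizon=100_000,
    mdp_specs=mdp_specs,
    parameters=parameters,
)
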
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The estimate of the optimal policy given the current knowledge of the agent, expressed as a distribution over actions.
def is_episode_end( self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, time: int) -> bool:

Checks whether the episode has terminated. By default, this checks whether the current time step exceeds the time horizon. In the continuous case, this can be used to define artificial episodes.

Parameters
  • ts_t (dm_env.TimeStep): The TimeStep at time t.
  • a_t ("ACTION_TYPE"): The action taken by the agent at time t.
  • ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
  • time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
  • bool: True if the episode terminated at time t+1.
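
In this implementation (see the source above), an artificial episode can only end once at least min_steps_before_new_episode steps have elapsed since the last switch; it then ends when the in-episode visit count \( \nu_k \) of the current state-action pair accounts for at least half of its total visit count \( N_\tau \), i.e. when

\[ N_\tau \geq 2 \, (N_\tau - \nu_k) \iff \nu_k \geq \tfrac{1}{2} N_\tau . \]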
def episode_end_update(self):

Called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.

def before_start_interacting(self):

Called before the agent starts interacting with the MDP.

def select_action( self, ts: dm_env._environment.TimeStep, time: int) -> Union[int, float, numpy.ndarray]:
Parameters
  • ts (dm_env.TimeStep): The TimeStep for which the agent is required to calculate the next action.
  • time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
  • action (ACTION_TYPE): The action that the agent suggests to take given the observation and the time step.
def step_update( self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, h: int):

Adds the transition given in input to the MDP model.

Parameters
  • ts_t (dm_env.TimeStep): The TimeStep at time t.
  • a_t ("ACTION_TYPE"): The action taken by the agent at time t.
  • ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
  • h (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
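
Concretely (see the source above), for a transition from state \( s \) with action \( a \) to state \( s' \), the smoothed count is refreshed as

\[ M[s, a, s'] = \frac{N[s, a, s'] + \omega}{\kappa}, \]

using the visit count before the update, and then \( N[s, a, s'] \) is incremented by one; if \( s' \) is not terminal, it is also appended to the per-episode transition data used by is_episode_end.
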
def optimistic_sampling(self):

Performs the optimistic sampling procedure.
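
In more detail (following the source above), each state-action pair whose total visit count is at least \( \eta \) receives \( \psi \) next-state distributions sampled from the posterior of the transitions model, whereas the remaining pairs receive an optimistically perturbed version of the empirical estimate \( \hat{P} \). Writing \( S \) for the number of states and \( N(s, a, s') \) for the element-wise visit counts (floored at one), the perturbed estimate is

\[ P^{-}(s' \mid s, a) = \hat{P}(s' \mid s, a) - \min\!\left( \sqrt{\frac{3 \, \hat{P}(s' \mid s, a) \, \log(4 S)}{N(s, a, s')}} + \frac{3 \, \log(4 S)}{N(s, a, s')}, \; \hat{P}(s' \mid s, a) \right), \]

and, for each of the \( \psi \) samples, the probability mass removed in this way is reassigned to a uniformly drawn random state.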

def extended_action_to_real(self, action) -> int:

Transforms the extended action, used to induce optimism, into a real action of the MDP.
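
As a small illustration of the decoding (the values of psi and action below are made up), when optimistic sampling is enabled the actor works on an extended action space of size n_actions * psi, and the real action is recovered by integer division:

# Made-up values, for illustration only.
psi = 4      # number of optimistic transition samples per state-action pair
action = 10  # extended action index selected by the actor

sample_index, real_action = action % psi, action // psi
print(sample_index, real_action)  # prints: 2 2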