colosseum.agent.agents.infinite_horizon.posterior_sampling

from typing import TYPE_CHECKING, Any, Callable, Dict, Union

import dm_env
import gin
import numpy as np
from ray import tune

from colosseum.agent.actors import QValuesActor
from colosseum.agent.agents.base import BaseAgent
from colosseum.agent.mdp_models.bayesian_model import BayesianMDPModel
from colosseum.agent.mdp_models.bayesian_models import RewardsConjugateModel
from colosseum.agent.mdp_models.bayesian_models import TransitionsConjugateModel
from colosseum.dynamic_programming import discounted_value_iteration
from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.emission_maps import EmissionMap
from colosseum.utils.acme.specs import MDPSpec

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE


def get_psi(n_states, n_actions, T, p) -> float:
    r"""
    computes the theoretical value of the :math:`\psi` parameter.

    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    p : float
        The probability of failure.

    Returns
    -------
    float
        The theoretical value of the :math:`\psi` parameter.
    """
    return n_states * np.log(n_states * n_actions / p)


def get_omega(n_states, n_actions, T, p) -> float:
    r"""
    computes the theoretical value of the :math:`\omega` parameter.

    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    p : float
        The probability of failure.

    Returns
    -------
    float
        The theoretical value of the :math:`\omega` parameter.
    """
    return np.log(T / p)


def get_kappa(n_states, n_actions, T, p) -> float:
    r"""
    computes the theoretical value of the :math:`\kappa` parameter.

    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    p : float
        The probability of failure.

    Returns
    -------
    float
        The theoretical value of the :math:`\kappa` parameter.
    """
    return np.log(T / p)


def get_eta(n_states, n_actions, T, p, omega) -> float:
    r"""
    computes the theoretical value of the :math:`\eta` parameter.

    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    p : float
        The probability of failure.
    omega : float
        The omega parameter.

    Returns
    -------
    float
        The theoretical value of the :math:`\eta` parameter.
    """
    return np.sqrt(T * n_states / n_actions) + 12 * omega * n_states ** 4


@gin.configurable
class PSRLContinuous(BaseAgent):
    """
    The posterior sampling for reinforcement learning algorithm with optimism.

    Agrawal, Shipra, and Randy Jia. "Posterior sampling for reinforcement learning: worst-case regret bounds." arXiv
    preprint arXiv:1705.07041 (2017).
    """

    @staticmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        return emission_map.is_tabular

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        return (
            "from colosseum.agent.mdp_models import bayesian_models\n"
            f"prms_{index}/PSRLContinuous.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG\n"
            f"prms_{index}/PSRLContinuous.rewards_prior_prms = [{parameters['rewards_prior_mean']}, 1, 1, 1]\n"
            f"prms_{index}/PSRLContinuous.psi_weight = {parameters['psi_weight']}\n"
            f"prms_{index}/PSRLContinuous.omega_weight = {parameters['omega_weight']}\n"
            f"prms_{index}/PSRLContinuous.kappa_weight = {parameters['kappa_weight']}\n"
            f"prms_{index}/PSRLContinuous.eta_weight = {parameters['eta_weight']}"
        )

    @staticmethod
    def is_episodic() -> bool:
        return False

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "psi_weight": tune.uniform(0.001, 0.1),
            "omega_weight": tune.uniform(0.0001, 1),
            "kappa_weight": tune.uniform(0.2, 4),
            "eta_weight": tune.uniform(1e-10, 1e-6),
            "rewards_prior_mean": tune.uniform(0.0, 1.2),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: MDPSpec,
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return PSRLContinuous(
            mdp_specs=mdp_specs,
            seed=seed,
            optimization_horizon=optimization_horizon,
            reward_prior_model=RewardsConjugateModel.N_NIG,
            rewards_prior_prms=[parameters["rewards_prior_mean"], 1, 1, 1],
            psi_weight=parameters["psi_weight"],
            omega_weight=parameters["omega_weight"],
            kappa_weight=parameters["kappa_weight"],
            eta_weight=parameters["eta_weight"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        T_map, R_map = self._mdp_model.get_map_estimate()
        Q, _ = discounted_value_iteration(T_map, R_map)
        return get_policy_from_q_values(Q, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: MDPSpec,
        optimization_horizon: int,
        # MDP model parameters
        reward_prior_model: RewardsConjugateModel = None,
        transitions_prior_model: TransitionsConjugateModel = None,
        rewards_prior_prms=None,
        transitions_prior_prms=None,
        # Actor parameters
        epsilon_greedy: Union[float, Callable] = None,
        boltzmann_temperature: Union[float, Callable] = None,
        psi_weight: float = 1.0,
        omega_weight: float = 1.0,
        kappa_weight: float = 1.0,
        eta_weight: float = 1.0,
        get_psi: Callable[[int, int, int, float], float] = get_psi,
        get_omega: Callable[[int, int, int, float], float] = get_omega,
        get_kappa: Callable[[int, int, int, float], float] = get_kappa,
        get_eta: Callable[[int, int, int, float, float], float] = get_eta,
        p: float = 0.05,
        no_optimistic_sampling: bool = False,
        truncate_reward_with_max: bool = False,
        min_steps_before_new_episode: int = 0,
        max_psi: int = 60,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        reward_prior_model : RewardsConjugateModel, optional
            The reward priors.
        transitions_prior_model : TransitionsConjugateModel, optional
            The transitions priors.
        rewards_prior_prms : Any
            The reward prior parameters.
        transitions_prior_prms : Any
            The transitions prior parameters.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of the
            total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of
            the total number of interactions. By default, Boltzmann exploration is disabled.
        psi_weight : float
            The coefficient for which the theoretical value of the :math:`\psi` parameter is multiplied for. By default,
            it is set to one.
        omega_weight : float
            The coefficient for which the theoretical value of the :math:`\omega` parameter is multiplied for. By default,
            it is set to one.
        kappa_weight : float
            The coefficient for which the theoretical value of the :math:`\kappa` parameter is multiplied for. By default,
            it is set to one.
        eta_weight : float
            The coefficient for which the theoretical value of the :math:`\eta` parameter is multiplied for. By default,
            it is set to one.
        get_psi : Callable[[int, int, int, float], float]
            The function that computes the value of the :math:`\psi` parameter given number of states, number of action,
             optimization horizon, and probability of failure. By default, it is set to the theoretical value.
        get_omega : Callable[[int, int, int, float], float]
            The function that computes the value of the :math:`\omega` parameter given number of states, number of action,
             optimization horizon, and probability of failure. By default, it is set to the theoretical value.
        get_kappa : Callable[[int, int, int, float], float]
            The function that computes the value of the :math:`\kappa` parameter given number of states, number of action,
             optimization horizon, and probability of failure. By default, it is set to the theoretical value.
        get_eta : Callable[[int, int, int, float, float], float]
            The function that computes the value of the :math:`\eta` parameter given number of states, number of action,
             optimization horizon, probability of failure, and the omega parameter. By default, it is set to the
             theoretical value.
        p : float
            The probability of failure. By default, it is set to :math:`0.05`.
        no_optimistic_sampling : bool
            If True the optimistic sampling procedure is disabled.
        truncate_reward_with_max : bool
            If True, the sampled rewards are truncated to the maximum possible value of the reward. By default, it is
            set to False.
        min_steps_before_new_episode : int
            The minimum interval length between artificial episodes. By default, it is set to zero.
        max_psi : int
            The maximum value of the :math:`\psi` parameter. By default, it is set to :math:`60`.
        """

        self._n_states = mdp_specs.observations.num_values
        self._n_actions = mdp_specs.actions.num_values

        self.truncate_reward_with_max = truncate_reward_with_max
        self.no_optimistic_sampling = (
            no_optimistic_sampling
            or (self._n_states ** 2 * self._n_actions) > 6_000_000
        )

        self.p = p
        self.psi = min(
            max_psi,
            max(
                2,
                int(
                    psi_weight
                    * get_psi(self._n_states, self._n_actions, optimization_horizon, p)
                ),
            ),
        )
        self.omega = omega_weight * get_omega(
            self._n_states, self._n_actions, optimization_horizon, p
        )
        self.kappa = kappa_weight * get_kappa(
            self._n_states, self._n_actions, optimization_horizon, p
        )
        self.eta = max(
            5,
            min(
                10 * self._n_states,
                eta_weight
                * get_eta(
                    self._n_states,
                    self._n_actions,
                    optimization_horizon,
                    p,
                    self.omega,
                ),
            ),
        )

        self._n_states = mdp_specs.observations.num_values
        self.episode = 0
        self.min_steps_before_new_episode = min_steps_before_new_episode
        self.last_change = 0

        self.M = np.zeros(
            (self._n_states, self._n_actions, self._n_states), dtype=np.float32
        )
        self.N = np.zeros(
            (self._n_states, self._n_actions, self._n_states), dtype=np.int32
        )
        q_shape = (
            (self._n_states, self._n_actions, self._n_states)
            if no_optimistic_sampling
            else (self.psi, self._n_states, self._n_actions, self._n_states)
        )
        self.Q = np.zeros(q_shape, dtype=np.float32)
        self.nu_k = np.zeros((self._n_states, self._n_actions), dtype=np.int8)
        self.episode_transition_data = dict()

        mdp_model = BayesianMDPModel(
            seed,
            mdp_specs,
            reward_prior_model=reward_prior_model,
            transitions_prior_model=transitions_prior_model,
            rewards_prior_prms=rewards_prior_prms,
            transitions_prior_prms=transitions_prior_prms,
        )

        super(PSRLContinuous, self).__init__(
            seed,
            mdp_specs,
            mdp_model,
            QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature),
            optimization_horizon,
        )

    def is_episode_end(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ) -> bool:
        if time - self.last_change < self.min_steps_before_new_episode:
            return False
        self.last_change = time
        nu_k = len(self.episode_transition_data[ts_t.observation, a_t])
        N_tau = self.N[ts_t.observation, a_t].sum()
        return N_tau >= 2 * (N_tau - nu_k)

    def episode_end_update(self):
        if self.no_optimistic_sampling:
            T = self._mdp_model.sample_T()
        else:
            self.optimistic_sampling()
            T = np.moveaxis(self.Q, 0, 2)
            T = T.reshape((self._n_states, -1, self._n_states))

        R = self._mdp_model.sample_R()
        if self.truncate_reward_with_max:
            R = np.maximum(self.r_max, R)
        if not self.no_optimistic_sampling:
            R = np.tile(R, (1, self.psi))

        Q, _ = discounted_value_iteration(T, R)
        self._actor.set_q_values(Q)

        self.episode_transition_data = dict()

    def before_start_interacting(self):
        self._actor.set_q_values(
            self._rng.randn(self._n_states, self._n_actions * self.psi)
        )
        self.episode_end_update()

    def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE":
        return self.extended_action_to_real(
            super(PSRLContinuous, self).select_action(ts, time)
        )

    def step_update(
        self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, h: int
    ):
        super(PSRLContinuous, self).step_update(ts_t, a_t, ts_tp1, h)

        self.M[ts_t.observation, a_t, ts_tp1.observation] = (
            self.N[ts_t.observation, a_t, ts_tp1.observation] + self.omega
        ) / self.kappa
        self.N[ts_t.observation, a_t, ts_tp1.observation] += 1

        if (ts_t.observation, a_t) in self.episode_transition_data:
            if not ts_tp1.last():
                self.episode_transition_data[ts_t.observation, a_t].append(
                    ts_tp1.observation
                )
        else:
            if not ts_tp1.last():
                self.episode_transition_data[ts_t.observation, a_t] = [
                    ts_tp1.observation
                ]

    def optimistic_sampling(self):
        """
        performs the optimistic sampling procedure.
        """
        Nsum = self.N.sum(-1)
        cond = Nsum < self.eta
        indices_2 = list(np.where(cond))
        indices_1 = list(np.where(~cond))

        do_simple_sampling = len(indices_2[0]) > 0
        do_posterior_sampling = len(indices_1[0]) > 0
        if do_simple_sampling:
            P_hat = self.N / np.maximum(Nsum[..., None], 1)
            N = np.maximum(self.N, 1)
            P_minus = P_hat - np.minimum(
                np.sqrt(3 * P_hat * np.log(4 * self._n_states) / N)
                + 3 * np.log(4 * self._n_states) / N,
                P_hat,
            )

        for psi in range(self.psi):
            if do_posterior_sampling:
                self.Q[
                    tuple([np.array([psi] * len(indices_1[0]))] + indices_1)
                ] = self._mdp_model._transitions_model.sample_sa(tuple(indices_1))
            if do_simple_sampling:
                z = self._rng.randint(self._n_states)
                summing = 1 - P_minus.sum(-1)
                P_minus[:, :, z] += summing
                self.Q[
                    tuple([np.array([psi] * len(indices_2[0]))] + indices_2)
                ] = P_minus[tuple(indices_2)]
                P_minus[:, :, z] -= summing

    def extended_action_to_real(self, action) -> int:
        """
        transform the extended action used to induce optimistic to a real action of the MDP.
        """
        if self.no_optimistic_sampling:
            return action
        psi, real_action = action % self.psi, int(action / self.psi)
        return real_action
def get_psi(n_states, n_actions, T, p) -> float:

Computes the theoretical value of the \( \psi \) parameter.

Parameters
  • n_states (int): The number of states.
  • n_actions (int): The number of actions.
  • T (int): The optimization horizon.
  • p (float): The probability of failure.
Returns
  • float: The theoretical value of the \( \psi \) parameter.
def get_omega(n_states, n_actions, T, p) -> float:

Computes the theoretical value of the \( \omega \) parameter.

Parameters
  • n_states (int): The number of states.
  • n_actions (int): The number of actions.
  • T (int): The optimization horizon.
  • p (float): The probability of failure.
Returns
  • float: The theoretical value of the \( \omega \) parameter.
def get_kappa(n_states, n_actions, T, p) -> float:

Computes the theoretical value of the \( \kappa \) parameter.

Parameters
  • n_states (int): The number of states.
  • n_actions (int): The number of actions.
  • T (int): The optimization horizon.
  • p (float): The probability of failure.
Returns
  • float: The theoretical value of the \( \kappa \) parameter.
def get_eta(n_states, n_actions, T, p, omega) -> float:

Computes the theoretical value of the \( \eta \) parameter.

Parameters
  • n_states (int): The number of states.
  • n_actions (int): The number of actions.
  • T (int): The optimization horizon.
  • p (float): The probability of failure.
  • omega (float): The omega parameter.
Returns
  • float: The theoretical value of the \( \eta \) parameter.
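
The following sketch shows how these helper functions can be evaluated directly; the numbers of states and actions, the horizon, and the failure probability below are made-up values used only for illustration. Note that, inside PSRLContinuous.__init__, the resulting \( \psi \) is additionally clipped to the range [2, max_psi] and \( \eta \) to the range [5, 10 * n_states].

from colosseum.agent.agents.infinite_horizon.posterior_sampling import (
    get_eta,
    get_kappa,
    get_omega,
    get_psi,
)

# Made-up problem sizes, for illustration only.
n_states, n_actions = 10, 3
T = 100_000  # optimization horizon
p = 0.05     # probability of failure

psi = get_psi(n_states, n_actions, T, p)         # n_states * log(n_states * n_actions / p)
omega = get_omega(n_states, n_actions, T, p)     # log(T / p)
kappa = get_kappa(n_states, n_actions, T, p)     # log(T / p)
eta = get_eta(n_states, n_actions, T, p, omega)  # sqrt(T * n_states / n_actions) + 12 * omega * n_states ** 4

print(psi, omega, kappa, eta)
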
@gin.configurable
class PSRLContinuous(colosseum.agent.agents.base.BaseAgent):

The posterior sampling for reinforcement learning algorithm with optimism.

Agrawal, Shipra, and Randy Jia. "Posterior sampling for reinforcement learning: worst-case regret bounds." arXiv preprint arXiv:1705.07041 (2017).

PSRLContinuous(
    seed: int,
    mdp_specs: colosseum.utils.acme.specs.MDPSpec,
    optimization_horizon: int,
    reward_prior_model: colosseum.agent.mdp_models.bayesian_models.RewardsConjugateModel = None,
    transitions_prior_model: colosseum.agent.mdp_models.bayesian_models.TransitionsConjugateModel = None,
    rewards_prior_prms=None,
    transitions_prior_prms=None,
    epsilon_greedy: Union[float, Callable] = None,
    boltzmann_temperature: Union[float, Callable] = None,
    psi_weight: float = 1.0,
    omega_weight: float = 1.0,
    kappa_weight: float = 1.0,
    eta_weight: float = 1.0,
    get_psi: Callable[[int, int, int, float], float] = <function get_psi>,
    get_omega: Callable[[int, int, int, float], float] = <function get_omega>,
    get_kappa: Callable[[int, int, int, float], float] = <function get_kappa>,
    get_eta: Callable[[int, int, int, float, float], float] = <function get_eta>,
    p: float = 0.05,
    no_optimistic_sampling: bool = False,
    truncate_reward_with_max: bool = False,
    min_steps_before_new_episode: int = 0,
    max_psi: int = 60,
)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • reward_prior_model (RewardsConjugateModel, optional): The reward priors.
  • transitions_prior_model (TransitionsConjugateModel, optional): The transitions priors.
  • rewards_prior_prms (Any): The reward prior parameters.
  • transitions_prior_prms (Any): The transitions prior parameters.
  • epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
  • boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
  • psi_weight (float): The coefficient by which the theoretical value of the \( \psi \) parameter is multiplied. By default, it is set to one.
  • omega_weight (float): The coefficient by which the theoretical value of the \( \omega \) parameter is multiplied. By default, it is set to one.
  • kappa_weight (float): The coefficient by which the theoretical value of the \( \kappa \) parameter is multiplied. By default, it is set to one.
  • eta_weight (float): The coefficient by which the theoretical value of the \( \eta \) parameter is multiplied. By default, it is set to one.
  • get_psi (Callable[[int, int, int, float], float]): The function that computes the value of the \( \psi \) parameter given the number of states, the number of actions, the optimization horizon, and the probability of failure. By default, it is set to the theoretical value.
  • get_omega (Callable[[int, int, int, float], float]): The function that computes the value of the \( \omega \) parameter given the number of states, the number of actions, the optimization horizon, and the probability of failure. By default, it is set to the theoretical value.
  • get_kappa (Callable[[int, int, int, float], float]): The function that computes the value of the \( \kappa \) parameter given the number of states, the number of actions, the optimization horizon, and the probability of failure. By default, it is set to the theoretical value.
  • get_eta (Callable[[int, int, int, float, float], float]): The function that computes the value of the \( \eta \) parameter given the number of states, the number of actions, the optimization horizon, the probability of failure, and the omega parameter. By default, it is set to the theoretical value.
  • p (float): The probability of failure. By default, it is set to \( 0.05 \).
  • no_optimistic_sampling (bool): If True, the optimistic sampling procedure is disabled.
  • truncate_reward_with_max (bool): If True, the sampled rewards are truncated to the maximum possible value of the reward. By default, it is set to False.
  • min_steps_before_new_episode (int): The minimum interval length between artificial episodes. By default, it is set to zero.
  • max_psi (int): The maximum value of the \( \psi \) parameter. By default, it is set to \( 60 \).
@staticmethod
def is_emission_map_accepted(emission_map: colosseum.emission_maps.base.EmissionMap) -> bool:
Returns
  • bool: True if the agent class accepts the emission map.
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces a string containing the gin configuration corresponding to the parameters given in input.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration file.
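
As an illustrative sketch (the hyperparameter values below are made up), the produced gin bindings look as follows.

from colosseum.agent.agents.infinite_horizon.posterior_sampling import PSRLContinuous

# Made-up hyperparameter values, for illustration only.
parameters = dict(
    rewards_prior_mean=0.5,
    psi_weight=0.01,
    omega_weight=0.5,
    kappa_weight=1.0,
    eta_weight=1e-08,
)

print(PSRLContinuous.produce_gin_file_from_parameters(parameters, index=0))
# from colosseum.agent.mdp_models import bayesian_models
# prms_0/PSRLContinuous.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG
# prms_0/PSRLContinuous.rewards_prior_prms = [0.5, 1, 1, 1]
# prms_0/PSRLContinuous.psi_weight = 0.01
# prms_0/PSRLContinuous.omega_weight = 0.5
# prms_0/PSRLContinuous.kappa_weight = 1.0
# prms_0/PSRLContinuous.eta_weight = 1e-08
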
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding ray.tune samplers.
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the MDP specification and agent parameters given in input.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
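
A minimal usage sketch is given below; it assumes that an MDPSpec for a tabular MDP has already been obtained elsewhere (the mdp_specs placeholder is not constructed here) and that the ray.tune domains returned by get_hyperparameters_search_spaces expose a sample() method.

from colosseum.agent.agents.infinite_horizon.posterior_sampling import PSRLContinuous

# Sample one hyperparameter configuration from the ray.tune search spaces.
search_spaces = PSRLContinuous.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in search_spaces.items()}

mdp_specs = ...  # an MDPSpec for a tabular MDP, obtained from a Colosseum environment (not shown here)

agent = PSRLContinuous.get_agent_instance_from_parameters(
    seed=42,
    optimization_horizon=100_000,
    mdp_specs=mdp_specs,
    parameters=parameters,
)
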
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The estimate of the optimal policy given the current knowledge of the agent, expressed as a distribution over actions.
def is_episode_end( self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, time: int) -> bool:

Checks whether the episode has terminated. By default, this checks whether the current time step exceeds the time horizon. In the continuous case, this can be used to define artificial episodes.

Parameters
  • ts_t (dm_env.TimeStep): The TimeStep at time t.
  • a_t ("ACTION_TYPE"): The action taken by the agent at time t.
  • ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
  • time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
  • bool: True if the episode terminated at time t+1.
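
In this implementation (see the source above), an artificial episode can only end once at least min_steps_before_new_episode steps have elapsed since the last switch; it then ends when the in-episode visit count \( \nu_k \) of the current state-action pair accounts for at least half of its total visit count \( N_\tau \), i.e. when

\[ N_\tau \geq 2 \, (N_\tau - \nu_k) \iff \nu_k \geq \tfrac{1}{2} N_\tau . \]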
def episode_end_update(self):

Called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.

def before_start_interacting(self):

Called before the agent starts interacting with the MDP.

def select_action( self, ts: dm_env._environment.TimeStep, time: int) -> Union[int, float, numpy.ndarray]:
Parameters
  • ts (dm_env.TimeStep): The TimeStep for which the agent is required to calculate the next action.
  • time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
  • action (ACTION_TYPE): The action that the agent suggests to take given the observation and the time step.
def step_update( self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, h: int):

Adds the transition given in input to the MDP model.

Parameters
  • ts_t (dm_env.TimeStep): The TimeStep at time t.
  • a_t ("ACTION_TYPE"): The action taken by the agent at time t.
  • ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
  • h (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
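
Concretely (see the source above), for a transition from state \( s \) with action \( a \) to state \( s' \), the smoothed count is refreshed as

\[ M[s, a, s'] = \frac{N[s, a, s'] + \omega}{\kappa}, \]

using the visit count before the update, and then \( N[s, a, s'] \) is incremented by one; if \( s' \) is not terminal, it is also appended to the per-episode transition data used by is_episode_end.
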
def optimistic_sampling(self):

Performs the optimistic sampling procedure.
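
In more detail (following the source above), each state-action pair whose total visit count is at least \( \eta \) receives \( \psi \) next-state distributions sampled from the posterior of the transitions model, whereas the remaining pairs receive an optimistically perturbed version of the empirical estimate \( \hat{P} \). Writing \( S \) for the number of states and \( N(s, a, s') \) for the element-wise visit counts (floored at one), the perturbed estimate is

\[ P^{-}(s' \mid s, a) = \hat{P}(s' \mid s, a) - \min\!\left( \sqrt{\frac{3 \, \hat{P}(s' \mid s, a) \, \log(4 S)}{N(s, a, s')}} + \frac{3 \, \log(4 S)}{N(s, a, s')}, \; \hat{P}(s' \mid s, a) \right), \]

and, for each of the \( \psi \) samples, the probability mass removed in this way is reassigned to a uniformly drawn random state.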

def extended_action_to_real(self, action) -> int:

Transforms the extended action, used to induce optimism, into a real action of the MDP.
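
As a small illustration of the decoding (the values of psi and action below are made up), when optimistic sampling is enabled the actor works on an extended action space of size n_actions * psi, and the real action is recovered by integer division:

# Made-up values, for illustration only.
psi = 4      # number of optimistic transition samples per state-action pair
action = 10  # extended action index selected by the actor

sample_index, real_action = action % psi, action // psi
print(sample_index, real_action)  # prints: 2 2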