colosseum.agent.agents.infinite_horizon.posterior_sampling
from typing import TYPE_CHECKING, Any, Callable, Dict, Union

import dm_env
import gin
import numpy as np
from ray import tune

from colosseum.agent.actors import QValuesActor
from colosseum.agent.agents.base import BaseAgent
from colosseum.agent.mdp_models.bayesian_model import BayesianMDPModel
from colosseum.agent.mdp_models.bayesian_models import RewardsConjugateModel
from colosseum.agent.mdp_models.bayesian_models import TransitionsConjugateModel
from colosseum.dynamic_programming import discounted_value_iteration
from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.emission_maps import EmissionMap
from colosseum.utils.acme.specs import MDPSpec

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE
def get_psi(n_states, n_actions, T, p) -> float:
    return n_states * np.log(n_states * n_actions / p)
Computes the theoretical value of the \( \psi \) parameter.
Parameters
- n_states (int): The number of states.
- n_actions (int): The number of actions.
- T (int): The optimization horizon.
- p (float): The probability of failure.
Returns
- float: The theoretical value of the \( \psi \) parameter.
def get_omega(n_states, n_actions, T, p) -> float:
    return np.log(T / p)
Computes the theoretical value of the \( \omega \) parameter.
Parameters
- n_states (int): The number of states.
- n_actions (int): The number of actions.
- T (int): The optimization horizon.
- p (float): The probability of failure.
Returns
- float: The theoretical value of the \( \omega \) parameter.
def get_kappa(n_states, n_actions, T, p) -> float:
    return np.log(T / p)
Computes the theoretical value of the \( \kappa \) parameter.
Parameters
- n_states (int): The number of states.
- n_actions (int): The number of actions.
- T (int): The optimization horizon.
- p (float): The probability of failure.
Returns
- float: The theoretical value of the \( \kappa \) parameter.
def get_eta(n_states, n_actions, T, p, omega) -> float:
    return np.sqrt(T * n_states / n_actions) + 12 * omega * n_states ** 4
Computes the theoretical value of the \( \eta \) parameter.
Parameters
- n_states (int): The number of states.
- n_actions (int): The number of actions.
- T (int): The optimization horizon.
- p (float): The probability of failure.
- omega (float): The omega parameter.
Returns
- float: The theoretical value of the \( \eta \) parameter.
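As a quick illustration, the sketch below evaluates the four helper functions for a hypothetical small MDP; the numbers (10 states, 3 actions, a horizon of 100 000, failure probability 0.05) are chosen only for the example.

```python
from colosseum.agent.agents.infinite_horizon.posterior_sampling import (
    get_eta,
    get_kappa,
    get_omega,
    get_psi,
)

# Hypothetical problem sizes, chosen only for illustration.
n_states, n_actions, T, p = 10, 3, 100_000, 0.05

psi = get_psi(n_states, n_actions, T, p)      # 10 * ln(10 * 3 / 0.05) ≈ 63.97
omega = get_omega(n_states, n_actions, T, p)  # ln(T / p) ≈ 14.51
kappa = get_kappa(n_states, n_actions, T, p)  # ln(T / p) ≈ 14.51
eta = get_eta(n_states, n_actions, T, p, omega)
# sqrt(T * n_states / n_actions) + 12 * omega * n_states ** 4 ≈ 1.74e6

# Note that PSRLContinuous does not use the raw values directly: for instance,
# psi is clipped to [2, max_psi] and eta to [5, 10 * n_states] (see __init__ below).
print(psi, omega, kappa, eta)
```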
@gin.configurable
class PSRLContinuous(BaseAgent):
The posterior sampling for reinforcement learning algorithm with optimism.
Agrawal, Shipra, and Randy Jia. "Posterior sampling for reinforcement learning: worst-case regret bounds." arXiv preprint arXiv:1705.07041 (2017).
def __init__(
    self,
    seed: int,
    mdp_specs: MDPSpec,
    optimization_horizon: int,
    # MDP model parameters
    reward_prior_model: RewardsConjugateModel = None,
    transitions_prior_model: TransitionsConjugateModel = None,
    rewards_prior_prms=None,
    transitions_prior_prms=None,
    # Actor parameters
    epsilon_greedy: Union[float, Callable] = None,
    boltzmann_temperature: Union[float, Callable] = None,
    psi_weight: float = 1.0,
    omega_weight: float = 1.0,
    kappa_weight: float = 1.0,
    eta_weight: float = 1.0,
    get_psi: Callable[[int, int, int, float], float] = get_psi,
    get_omega: Callable[[int, int, int, float], float] = get_omega,
    get_kappa: Callable[[int, int, int, float], float] = get_kappa,
    get_eta: Callable[[int, int, int, float, float], float] = get_eta,
    p: float = 0.05,
    no_optimistic_sampling: bool = False,
    truncate_reward_with_max: bool = False,
    min_steps_before_new_episode: int = 0,
    max_psi: int = 60,
):
    self._n_states = mdp_specs.observations.num_values
    self._n_actions = mdp_specs.actions.num_values

    self.truncate_reward_with_max = truncate_reward_with_max
    self.no_optimistic_sampling = (
        no_optimistic_sampling
        or (self._n_states ** 2 * self._n_actions) > 6_000_000
    )

    self.p = p
    self.psi = min(
        max_psi,
        max(
            2,
            int(
                psi_weight
                * get_psi(self._n_states, self._n_actions, optimization_horizon, p)
            ),
        ),
    )
    self.omega = omega_weight * get_omega(
        self._n_states, self._n_actions, optimization_horizon, p
    )
    self.kappa = kappa_weight * get_kappa(
        self._n_states, self._n_actions, optimization_horizon, p
    )
    self.eta = max(
        5,
        min(
            10 * self._n_states,
            eta_weight
            * get_eta(
                self._n_states,
                self._n_actions,
                optimization_horizon,
                p,
                self.omega,
            ),
        ),
    )

    self._n_states = mdp_specs.observations.num_values
    self.episode = 0
    self.min_steps_before_new_episode = min_steps_before_new_episode
    self.last_change = 0

    self.M = np.zeros(
        (self._n_states, self._n_actions, self._n_states), dtype=np.float32
    )
    self.N = np.zeros(
        (self._n_states, self._n_actions, self._n_states), dtype=np.int32
    )
    q_shape = (
        (self._n_states, self._n_actions, self._n_states)
        if no_optimistic_sampling
        else (self.psi, self._n_states, self._n_actions, self._n_states)
    )
    self.Q = np.zeros(q_shape, dtype=np.float32)
    self.nu_k = np.zeros((self._n_states, self._n_actions), dtype=np.int8)
    self.episode_transition_data = dict()

    mdp_model = BayesianMDPModel(
        seed,
        mdp_specs,
        reward_prior_model=reward_prior_model,
        transitions_prior_model=transitions_prior_model,
        rewards_prior_prms=rewards_prior_prms,
        transitions_prior_prms=transitions_prior_prms,
    )

    super(PSRLContinuous, self).__init__(
        seed,
        mdp_specs,
        mdp_model,
        QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature),
        optimization_horizon,
    )
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- reward_prior_model (RewardsConjugateModel, optional): The reward priors.
- transitions_prior_model (TransitionsConjugateModel, optional): The transitions priors.
- rewards_prior_prms (Any): The reward prior parameters.
- transitions_prior_prms (Any): The transitions prior parameters.
- epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
- boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
- psi_weight (float): The coefficient by which the theoretical value of the \( \psi \) parameter is multiplied. By default, it is set to one.
- omega_weight (float): The coefficient by which the theoretical value of the \( \omega \) parameter is multiplied. By default, it is set to one.
- kappa_weight (float): The coefficient by which the theoretical value of the \( \kappa \) parameter is multiplied. By default, it is set to one.
- eta_weight (float): The coefficient by which the theoretical value of the \( \eta \) parameter is multiplied. By default, it is set to one.
- get_psi (Callable[[int, int, int, float], float]): The function that computes the value of the \( \psi \) parameter given the number of states, the number of actions, the optimization horizon, and the probability of failure. By default, it is set to the theoretical value.
- get_omega (Callable[[int, int, int, float], float]): The function that computes the value of the \( \omega \) parameter given the number of states, the number of actions, the optimization horizon, and the probability of failure. By default, it is set to the theoretical value.
- get_kappa (Callable[[int, int, int, float], float]): The function that computes the value of the \( \kappa \) parameter given the number of states, the number of actions, the optimization horizon, and the probability of failure. By default, it is set to the theoretical value.
- get_eta (Callable[[int, int, int, float, float], float]): The function that computes the value of the \( \eta \) parameter given the number of states, the number of actions, the optimization horizon, the probability of failure, and the omega parameter. By default, it is set to the theoretical value.
- p (float): The probability of failure. By default, it is set to \( 0.05 \).
- no_optimistic_sampling (bool): If True, the optimistic sampling procedure is disabled.
- truncate_reward_with_max (bool): If True, the sampled rewards are truncated to the maximum possible value of the reward. By default, it is set to False.
- min_steps_before_new_episode (int): The minimum interval length between artificial episodes. By default, it is set to zero.
- max_psi (int): The maximum value of the \( \psi \) parameter. By default, it is set to \( 60 \).
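A minimal construction sketch is shown below. It assumes that an `MDPSpec` for a tabular MDP is already available in a variable called `mdp_specs` (how to obtain one from a Colosseum MDP is outside the scope of this page); the prior choice mirrors the one used by `get_agent_instance_from_parameters`, and all other arguments are left at the defaults documented above.

```python
from colosseum.agent.agents.infinite_horizon.posterior_sampling import PSRLContinuous
from colosseum.agent.mdp_models.bayesian_models import RewardsConjugateModel

# `mdp_specs` is assumed to be an MDPSpec describing a tabular MDP
# (its construction is not shown here).
agent = PSRLContinuous(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=100_000,
    reward_prior_model=RewardsConjugateModel.N_NIG,
    rewards_prior_prms=[0.5, 1, 1, 1],  # arbitrary prior mean of 0.5
)
```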
@staticmethod
def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
    return emission_map.is_tabular
Returns
- bool: True if the agent class accepts the emission map.
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
    return (
        "from colosseum.agent.mdp_models import bayesian_models\n"
        f"prms_{index}/PSRLContinuous.reward_prior_model = %bayesian_models.RewardsConjugateModel.N_NIG\n"
        f"prms_{index}/PSRLContinuous.rewards_prior_prms = [{parameters['rewards_prior_mean']}, 1, 1, 1]\n"
        f"prms_{index}/PSRLContinuous.psi_weight = {parameters['psi_weight']}\n"
        f"prms_{index}/PSRLContinuous.omega_weight = {parameters['omega_weight']}\n"
        f"prms_{index}/PSRLContinuous.kappa_weight = {parameters['kappa_weight']}\n"
        f"prms_{index}/PSRLContinuous.eta_weight = {parameters['eta_weight']}"
    )
Produces a string containing the gin config file corresponding to the parameters given in input.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration file.
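For illustration, the sketch below prints the gin configuration produced for a hypothetical parameter dictionary; the parameter values are arbitrary and only serve to show the format of the returned bindings.

```python
from colosseum.agent.agents.infinite_horizon.posterior_sampling import PSRLContinuous

# Arbitrary parameter values, chosen only for illustration.
parameters = dict(
    rewards_prior_mean=0.5,
    psi_weight=0.01,
    omega_weight=0.5,
    kappa_weight=1.0,
    eta_weight=1e-8,
)

# Prints gin bindings such as
#   prms_0/PSRLContinuous.psi_weight = 0.01
print(PSRLContinuous.produce_gin_file_from_parameters(parameters, index=0))
```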
@staticmethod
def is_episodic() -> bool:
    return False

Returns
- bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
    return {
        "psi_weight": tune.uniform(0.001, 0.1),
        "omega_weight": tune.uniform(0.0001, 1),
        "kappa_weight": tune.uniform(0.2, 4),
        "eta_weight": tune.uniform(1e-10, 1e-6),
        "rewards_prior_mean": tune.uniform(0.0, 1.2),
    }
Returns
- Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding ray.tune samplers.
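As a sketch of how these search spaces might be used, the snippet below draws one random configuration by calling `.sample()` on each ray.tune domain; whether the sampling is done manually like this or driven by ray.tune itself depends on the surrounding experiment code.

```python
from colosseum.agent.agents.infinite_horizon.posterior_sampling import PSRLContinuous

search_spaces = PSRLContinuous.get_hyperparameters_search_spaces()

# Draw one random value from each hyperparameter domain.
parameters = {name: domain.sample() for name, domain in search_spaces.items()}
print(parameters)  # e.g. {'psi_weight': 0.042, 'omega_weight': 0.31, ...}
```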
@staticmethod
def get_agent_instance_from_parameters(
    seed: int,
    optimization_horizon: int,
    mdp_specs: MDPSpec,
    parameters: Dict[str, Any],
) -> "BaseAgent":
    return PSRLContinuous(
        mdp_specs=mdp_specs,
        seed=seed,
        optimization_horizon=optimization_horizon,
        reward_prior_model=RewardsConjugateModel.N_NIG,
        rewards_prior_prms=[parameters["rewards_prior_mean"], 1, 1, 1],
        psi_weight=parameters["psi_weight"],
        omega_weight=parameters["omega_weight"],
        kappa_weight=parameters["kappa_weight"],
        eta_weight=parameters["eta_weight"],
    )
Returns an agent instance for the MDP specification and agent parameters given in input.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
@property
def current_optimal_stochastic_policy(self) -> np.ndarray:
    T_map, R_map = self._mdp_model.get_map_estimate()
    Q, _ = discounted_value_iteration(T_map, R_map)
    return get_policy_from_q_values(Q, True)

Returns
- np.ndarray: The estimate of the optimal policy given the current knowledge of the agent, in the form of a distribution over actions.
def is_episode_end(
    self,
    ts_t: dm_env.TimeStep,
    a_t: "ACTION_TYPE",
    ts_tp1: dm_env.TimeStep,
    time: int,
) -> bool:
    if time - self.last_change < self.min_steps_before_new_episode:
        return False
    self.last_change = time
    nu_k = len(self.episode_transition_data[ts_t.observation, a_t])
    N_tau = self.N[ts_t.observation, a_t].sum()
    return N_tau >= 2 * (N_tau - nu_k)
Checks whether the episode has terminated. By default, this checks whether the current time step exceeds the time horizon. In the continuous case, it can be used to define artificial episodes.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t ("ACTION_TYPE"): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
- bool: True if the episode terminated at time t+1.
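The return condition implements a doubling rule: a new artificial episode starts once the visit count of the current state-action pair has at least doubled since the episode began. The numbers below are hypothetical and only illustrate the check.

```python
# Hypothetical counts for the current (state, action) pair.
N_tau = 10  # total visits observed so far
nu_k = 6    # visits recorded during the current artificial episode

# Visits at the start of the episode were N_tau - nu_k = 4; since 10 >= 2 * 4,
# the count has at least doubled and a new artificial episode is triggered.
new_episode = N_tau >= 2 * (N_tau - nu_k)
print(new_episode)  # True
```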
def episode_end_update(self):
    if self.no_optimistic_sampling:
        T = self._mdp_model.sample_T()
    else:
        self.optimistic_sampling()
        T = np.moveaxis(self.Q, 0, 2)
        T = T.reshape((self._n_states, -1, self._n_states))

    R = self._mdp_model.sample_R()
    if self.truncate_reward_with_max:
        R = np.maximum(self.r_max, R)
    if not self.no_optimistic_sampling:
        R = np.tile(R, (1, self.psi))

    Q, _ = discounted_value_iteration(T, R)
    self._actor.set_q_values(Q)

    self.episode_transition_data = dict()
Is called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.
def before_start_interacting(self):
    self._actor.set_q_values(
        self._rng.randn(self._n_states, self._n_actions * self.psi)
    )
    self.episode_end_update()
Is called before the agent starts interacting with the MDP.
def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE":
    return self.extended_action_to_real(
        super(PSRLContinuous, self).select_action(ts, time)
    )
Parameters
- ts (dm_env.TimeStep): The TimeStep for which the agent is required to calculate the next action.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
- action (ACTION_TYPE): The action that the agent suggests to take given the observation and the time step.
def step_update(
    self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, h: int
):
    super(PSRLContinuous, self).step_update(ts_t, a_t, ts_tp1, h)

    self.M[ts_t.observation, a_t, ts_tp1.observation] = (
        self.N[ts_t.observation, a_t, ts_tp1.observation] + self.omega
    ) / self.kappa
    self.N[ts_t.observation, a_t, ts_tp1.observation] += 1

    if (ts_t.observation, a_t) in self.episode_transition_data:
        if not ts_tp1.last():
            self.episode_transition_data[ts_t.observation, a_t].append(
                ts_tp1.observation
            )
    else:
        if not ts_tp1.last():
            self.episode_transition_data[ts_t.observation, a_t] = [
                ts_tp1.observation
            ]
Adds the transition in input to the MDP model.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t ("ACTION_TYPE"): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- h (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case it refers to the total number of previous interactions.
def optimistic_sampling(self):
    Nsum = self.N.sum(-1)
    cond = Nsum < self.eta
    indices_2 = list(np.where(cond))
    indices_1 = list(np.where(~cond))

    do_simple_sampling = len(indices_2[0]) > 0
    do_posterior_sampling = len(indices_1[0]) > 0
    if do_simple_sampling:
        P_hat = self.N / np.maximum(Nsum[..., None], 1)
        N = np.maximum(self.N, 1)
        P_minus = P_hat - np.minimum(
            np.sqrt(3 * P_hat * np.log(4 * self._n_states) / N)
            + 3 * np.log(4 * self._n_states) / N,
            P_hat,
        )

    for psi in range(self.psi):
        if do_posterior_sampling:
            self.Q[
                tuple([np.array([psi] * len(indices_1[0]))] + indices_1)
            ] = self._mdp_model._transitions_model.sample_sa(tuple(indices_1))
        if do_simple_sampling:
            z = self._rng.randint(self._n_states)
            summing = 1 - P_minus.sum(-1)
            P_minus[:, :, z] += summing
            self.Q[
                tuple([np.array([psi] * len(indices_2[0]))] + indices_2)
            ] = P_minus[tuple(indices_2)]
            P_minus[:, :, z] -= summing
Performs the optimistic sampling procedure.
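To make the confidence-shrunk transition estimate concrete, here is a small standalone sketch of the `P_minus` computation for a single hypothetical state-action pair with made-up visit counts; it mirrors the expression in the method above but is not part of the library.

```python
import numpy as np

n_states = 4
# Hypothetical visit counts N(s, a, s') for one state-action pair.
counts = np.array([12, 3, 0, 5])

p_hat = counts / max(counts.sum(), 1)  # empirical transition estimate
n = np.maximum(counts, 1)
bonus = np.sqrt(3 * p_hat * np.log(4 * n_states) / n) + 3 * np.log(4 * n_states) / n
p_minus = p_hat - np.minimum(bonus, p_hat)  # shrunk towards zero, never negative

# In the method above, the leftover mass 1 - p_minus.sum() is then assigned to a
# randomly chosen next state for each of the psi samples, which is what induces
# the optimism of the resulting transition vectors.
print(p_minus, 1 - p_minus.sum())
```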
def extended_action_to_real(self, action) -> int:
    if self.no_optimistic_sampling:
        return action
    psi, real_action = action % self.psi, int(action / self.psi)
    return real_action
Transforms the extended action used to induce optimism into a real action of the MDP.
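As a small illustration of the mapping, assume \( \psi = 4 \) optimistic samples (a made-up value): extended action 9 then corresponds to real action 9 // 4 = 2, while the remainder indicates which of the \( \psi \) sampled transition models was selected.

```python
psi = 4              # number of optimistic samples (hypothetical value)
extended_action = 9  # index into the enlarged action space of size n_actions * psi

real_action = extended_action // psi  # 2: the action actually sent to the MDP
sample_index = extended_action % psi  # 1: which of the psi sampled transition models
print(real_action, sample_index)
```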