colosseum.agent.agents.episodic.q_learning
```python
from typing import TYPE_CHECKING, Any, Callable, Dict, Union

import dm_env
import gin
import numpy as np
from ray import tune

from colosseum.agent.actors import QValuesActor
from colosseum.agent.agents.base import BaseAgent
from colosseum.agent.mdp_models.base import BaseMDPModel
from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.emission_maps import EmissionMap

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE
    from colosseum.utils.acme.specs import MDPSpec


class QValuesModel(BaseMDPModel):
    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        p: float,
        c_1: float,
        c_2: float = None,
        min_at: float = 0,
        UCB_type="hoeffding",
    ):
        super(QValuesModel, self).__init__(seed, mdp_specs)

        self._UCB_type = UCB_type
        self._min_at = min_at
        self._c_1 = c_1
        self._c_2 = c_2
        self._p = p

        self.i = np.log(self._n_states * self._n_actions * optimization_horizon / p)
        self.N = np.ones((self._H, self._n_states, self._n_actions), np.int32)
        self.Q = (
            np.zeros((self._H, self._n_states, self._n_actions), np.float32) + self._H
        )
        self.V = np.zeros((self._H + 1, self._n_states), np.float32)

        if UCB_type == "bernstein":
            self.mu = np.zeros((self._H, self._n_states, self._n_actions), np.float32)
            self.sigma = np.zeros(
                (self._H, self._n_states, self._n_actions), np.float32
            )
            self.beta = np.zeros((self._H, self._n_states, self._n_actions), np.float32)

    def step_update(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ):
        s_t = ts_t.observation
        s_tp1 = ts_tp1.observation

        self.N[time, s_t, a_t] += 1

        t = self.N[time, s_t, a_t]
        self._alpha_t = max(self._min_at, (self._H + 1) / (self._H + t))

        if self._UCB_type == "hoeffding":
            b_t = self._c_1 * np.sqrt(self._H ** 3 * self.i / t)
        else:
            self.mu[time, s_t, a_t] += self.V[time + 1, s_tp1]
            self.sigma[time, s_t, a_t] += self.V[time + 1, s_tp1] ** 2
            old_beta = self.beta[time, s_t, a_t]
            self.beta[time, s_t, a_t] = min(
                self._c_1
                * (
                    np.sqrt(
                        (
                            self._H
                            * (
                                (self.sigma[time, s_t, a_t] - self.mu[time, s_t, a_t])
                                ** 2
                            )
                            / t ** 2
                            + self._H
                        )
                        * self.i
                    )
                    + np.sqrt(self._H ** 7 * self._n_states * self._n_actions)
                    * self.i
                    / t
                ),
                self._c_2 * np.sqrt(self._H ** 3 * self.i / t),
            )
            b_t = (
                (self.beta[time, s_t, a_t] - (1 - self._alpha_t) * old_beta)
                / 2
                / self._alpha_t
            )

        self.Q[time, s_t, a_t] = self._alpha_t * self.Q[time, s_t, a_t] + (
            1 - self._alpha_t
        ) * (ts_tp1.reward + self.V[time + 1, s_tp1] + b_t)
        self.V[time, s_t] = min(self._H, self.Q[time, s_t].max())


@gin.configurable
class QLearningEpisodic(BaseAgent):
    """
    The q-learning algorithm with UCB exploration.

    Jin, Chi, et al. "Is q-learning provably efficient?." Advances in neural information processing systems 31 (2018).
    """

    @staticmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        return emission_map.is_tabular

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = (
            f"prms_{index}/QLearningEpisodic.p=0.05\n"
            f'prms_{index}/QLearningEpisodic.UCB_type="bernstein"\n'
        )
        for k, v in parameters.items():
            string += f"prms_{index}/QLearningEpisodic.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "c_1": tune.uniform(0.001, 1.1),
            "c_2": tune.uniform(0.001, 1.1),
            "min_at": tune.uniform(0.001, 0.2),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return QLearningEpisodic(
            mdp_specs=mdp_specs,
            seed=seed,
            optimization_horizon=optimization_horizon,
            min_at=parameters["min_at"],
            c_1=parameters["c_1"],
            c_2=parameters["c_2"],
            UCB_type="bernstein",
            p=0.05,
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        return get_policy_from_q_values(self._mdp_model.Q, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        p: float,
        c_1: float,
        c_2: float = None,
        min_at: float = 0,
        UCB_type="hoeffding",
        # Actor parameters
        epsilon_greedy: Union[float, Callable] = None,
        boltzmann_temperature: Union[float, Callable] = None,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        p : float
            The value of the probability of failure.
        c_1 : float
            The value of the :math:`c_1` coefficient.
        c_2 : float
            The value of the :math:`c_2` coefficient.
        min_at : float
            The minimum value for the alpha coefficient. By default, it is set to zero.
        UCB_type : str
            The type of UCB bonus. It can either be 'hoeffding' or 'bernstein'.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of the
            total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of
            the total number of interactions. By default, Boltzmann exploration is disabled.
        """

        UCB_type = UCB_type.lower()
        assert 0 <= min_at < 0.99
        assert 0 < p < 1
        assert c_1 > 0
        assert UCB_type in ["hoeffding", "bernstein"]
        if UCB_type == "bernstein":
            assert c_2 is not None and c_2 > 0

        super(QLearningEpisodic, self).__init__(
            seed,
            mdp_specs,
            QValuesModel(
                seed,
                mdp_specs,
                optimization_horizon,
                p,
                c_1,
                c_2,
                min_at,
                UCB_type,
            ),
            QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature),
            optimization_horizon,
        )

    def episode_end_update(self):
        pass

    def before_start_interacting(self):
        self._actor.set_q_values(self._mdp_model.Q)

    def step_update(
        self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, h: int
    ):
        super(QLearningEpisodic, self).step_update(ts_t, a_t, ts_tp1, h)
        self._actor.set_q_values(self._mdp_model.Q)
```
QValuesModel maintains the visit counts, optimistic Q-value estimates, and exploration bonuses used by QLearningEpisodic. It derives from BaseMDPModel, the base class for MDP models.
QValuesModel( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, p: float, c_1: float, c_2: float = None, min_at: float = 0, UCB_type='hoeffding')
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- p (float): The value of the probability of failure.
- c_1 (float): The value of the \( c_1 \) coefficient.
- c_2 (float): The value of the \( c_2 \) coefficient.
- min_at (float): The minimum value for the alpha coefficient. By default, it is set to zero.
- UCB_type (str): The type of UCB bonus. It can either be 'hoeffding' or 'bernstein'.
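Reading off the constructor (see the source listing at the top of the page), and writing \( S \), \( A \), \( H \), and \( T \) for the number of states, the number of actions, the episode horizon, and the optimization horizon respectively, the model is initialized as

\[
\iota = \log\!\left(\frac{S\,A\,T}{p}\right), \qquad N_h(s, a) = 1, \qquad Q_h(s, a) = H, \qquad V_h(s) = 0,
\]

with the running statistics \( \mu \), \( \sigma \), and \( \beta \) additionally initialized to zero when the Bernstein bonus is selected.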
def step_update(self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, time: int):
Updates the model with the transition in input; the resulting update rule is written out after the parameter list below.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t ("ACTION_TYPE"): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
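In symbols, the update implemented by step_update (Hoeffding variant; see the source listing at the top of the page) uses the updated visit count \( t = N_h(s, a) \), the learning rate \( \alpha_t = \max(\mathrm{min\_at},\ \tfrac{H+1}{H+t}) \), and the log term \( \iota \) stored in self.i:

\[
b_t = c_1 \sqrt{\frac{H^3 \iota}{t}}, \qquad Q_h(s, a) \leftarrow \alpha_t\, Q_h(s, a) + (1 - \alpha_t)\big(r + V_{h+1}(s') + b_t\big), \qquad V_h(s) \leftarrow \min\!\big(H,\ \max_{a'} Q_h(s, a')\big).
\]

The Bernstein variant replaces \( b_t \) with a bonus computed from the running first and second moments of \( V_{h+1}(s') \) stored in self.mu and self.sigma.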
The Q-learning algorithm with UCB exploration.
Jin, Chi, et al. "Is Q-learning provably efficient?" Advances in Neural Information Processing Systems 31 (2018).
QLearningEpisodic( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, p: float, c_1: float, c_2: float = None, min_at: float = 0, UCB_type='hoeffding', epsilon_greedy: Union[float, Callable] = None, boltzmann_temperature: Union[float, Callable] = None)
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- p (float): The value of the probability of failure.
- c_1 (float): The value of the \( c_1 \) coefficient.
- c_2 (float): The value of the \( c_2 \) coefficient.
- min_at (float): The minimum value for the alpha coefficient. By default, it is set to zero.
- UCB_type (str): The type of UCB bonus. It can either be 'hoeffding' or 'bernstein'.
- epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
- boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
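A minimal construction sketch follows. The hyperparameter values are illustrative, and mdp_specs is assumed to be the MDPSpec of the environment the agent will interact with (obtained from the colosseum MDP in use); neither comes from the source above.

```python
from colosseum.agent.agents.episodic.q_learning import QLearningEpisodic

# Illustrative configuration: a Hoeffding-bonus agent. `mdp_specs` and the
# interaction budget are assumed to be available from the surrounding experiment.
agent = QLearningEpisodic(
    seed=42,
    mdp_specs=mdp_specs,           # MDPSpec of the target MDP (assumed given)
    optimization_horizon=100_000,  # total number of interactions with the MDP
    p=0.05,                        # probability of failure
    c_1=0.5,                       # UCB bonus coefficient
    UCB_type="hoeffding",          # "bernstein" additionally requires c_2 > 0
)
```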
@staticmethod
def is_emission_map_accepted(emission_map: colosseum.emission_maps.base.EmissionMap) -> bool:
Returns
- bool: True if the agent class accepts the emission map.
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
Produces a string containing the gin config file corresponding to the parameters given in input (an example of the output is shown below).
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration file.
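For illustration, with arbitrarily chosen parameter values, the produced configuration fixes p and UCB_type and then adds one binding per supplied parameter:

```python
print(
    QLearningEpisodic.produce_gin_file_from_parameters(
        {"c_1": 0.5, "c_2": 0.8, "min_at": 0.05}, index=0
    )
)
# prms_0/QLearningEpisodic.p=0.05
# prms_0/QLearningEpisodic.UCB_type="bernstein"
# prms_0/QLearningEpisodic.c_1 = 0.5
# prms_0/QLearningEpisodic.c_2 = 0.8
# prms_0/QLearningEpisodic.min_at = 0.05
```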
@staticmethod
def is_episodic() -> bool:
Returns
- bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
- Dict[str, tune.sample.Domain]: The dictionary whose keys are hyperparameter names and whose values are the corresponding ray.tune samplers.
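A small sketch of how these domains might be used, assuming Ray Tune's Domain.sample() interface for drawing a single value from each search space:

```python
space = QLearningEpisodic.get_hyperparameters_search_spaces()

# Draw one random configuration from the uniform search ranges,
# e.g. {"c_1": ..., "c_2": ..., "min_at": ...}.
parameters = {name: domain.sample() for name, domain in space.items()}
```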
@staticmethod
def get_agent_instance_from_parameters(seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:
Returns an agent instance for the MDP specification and agent parameters given in input (see the sketch below).
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
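Continuing the sampling sketch above, the drawn parameters can be turned into a ready-to-use agent; mdp_specs and the interaction budget are again assumed to be given:

```python
agent = QLearningEpisodic.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=100_000,
    mdp_specs=mdp_specs,    # assumed MDPSpec of the target environment
    parameters=parameters,  # must contain "c_1", "c_2", and "min_at"
)
```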
current_optimal_stochastic_policy: numpy.ndarray
Returns
- np.ndarray: The estimate of the optimal policy given the agent's current knowledge, expressed as a distribution over actions.
def episode_end_update(self):
Called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.
def step_update(self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, h: int):
Adds the input transition to the MDP model and refreshes the actor's Q-value estimates; a full interaction loop is sketched after the parameter list below.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t ("ACTION_TYPE"): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- h (int): The current time of the environment, i.e., the in-episode time step at which the transition occurred.
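The hooks before_start_interacting, step_update, and episode_end_update are meant to be driven by an interaction loop. A rough sketch is given below; it assumes a dm_env.Environment with episodes of horizon H, and an action-selection method select_action(ts_t, h) on the agent, whose name is an assumption here rather than part of the source shown on this page.

```python
agent.before_start_interacting()                 # pushes the initial Q-values to the actor
for episode in range(n_episodes):                # n_episodes: interaction budget (assumed)
    ts_t = env.reset()                           # env: a dm_env.Environment (assumed)
    for h in range(H):                           # H: episode horizon (assumed)
        a_t = agent.select_action(ts_t, h)       # assumed BaseAgent action-selection hook
        ts_tp1 = env.step(a_t)
        agent.step_update(ts_t, a_t, ts_tp1, h)  # updates the model and the actor's Q-values
        ts_t = ts_tp1
        if ts_tp1.last():
            break
    agent.episode_end_update()
```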