colosseum.agent.agents.infinite_horizon.q_learning
from typing import Any, Callable, Dict, Union

import dm_env
import gin
import numpy as np
from ray import tune
from typing_extensions import TYPE_CHECKING

from colosseum.agent.actors import QValuesActor
from colosseum.agent.agents.base import BaseAgent
from colosseum.agent.mdp_models.base import BaseMDPModel
from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.emission_maps import EmissionMap

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE
    from colosseum.utils.acme.specs import MDPSpec


def get_H(n_states, n_actions, T, span_approx, confidence):
    """
    computes the theoretical value for the horizon approximation value.
    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    span_approx : float
        The span approximation value.
    confidence : float
        One minus the probability of failure.

    Returns
    -------
    float
        The theoretical value for the horizon approximation value.
    """
    return min(
        np.sqrt(span_approx * T / n_states / n_actions),
        (T / n_states / n_actions / np.log(4 * T / confidence)) ** 0.333,
    )


class _QValuesModel(BaseMDPModel):
    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        min_at: float,
        confidence: float,
        span_approx_weight: float,
        get_span_approx: Callable[[int, int], float],
        h_weight: float,
        get_H: Callable[[int, int, int, float, float], float],
    ):
        super(_QValuesModel, self).__init__(seed, mdp_specs)

        self.min_at = min_at if min_at > 0.009 else 0
        self.span_approx = span_approx_weight
        if get_span_approx is not None:
            self.span_approx *= get_span_approx(self._n_states, self._n_actions)

        self.confidence = confidence
        self.optimization_horizon = optimization_horizon

        self.H = h_weight * get_H(
            self._n_states,
            self._n_actions,
            optimization_horizon,
            self.span_approx,
            confidence,
        )
        self.gamma = 1 - 1 / self.H

        self.N = np.zeros((self._n_states, self._n_actions), np.int32)
        self.Q = np.zeros((self._n_states, self._n_actions), np.float32) + self.H
        self.Q_main = np.zeros((self._n_states, self._n_actions), np.float32) + self.H
        self.V = np.zeros((self._n_states,), np.float32) + self.H

    def step_update(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ):
        s_t = ts_t.observation
        s_tp1 = ts_tp1.observation

        self.N[s_t, a_t] += 1
        alpha_t = max(self.min_at, (self.H + 1) / (self.H + self.N[s_t, a_t]))
        b_t = (
            4
            * self.span_approx
            * np.sqrt(
                self.H
                / self.N[s_t, a_t]
                * np.log(2 * self.optimization_horizon / self.confidence)
            )
        )

        self.Q_main[s_t, a_t] = (1 - alpha_t) * self.Q[s_t, a_t] + alpha_t * (
            ts_tp1.reward + self.gamma * self.V[s_tp1] + b_t
        )
        self.Q[s_t, a_t] = min(self.Q[s_t, a_t], self.Q_main[s_t, a_t])
        self.V[s_tp1] = self.Q[s_tp1].max()


@gin.configurable
class QLearningContinuous(BaseAgent):
    """
    The q-learning algorithm with optimism.

    Wei, Chen-Yu, et al. "Model-free reinforcement learning in infinite-horizon average-reward markov decision processes."
    International conference on machine learning. PMLR, 2020.
    """

    @staticmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        return emission_map.is_tabular

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/QLearningContinuous.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return False

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "h_weight": tune.uniform(0.001, 1.1),
            "span_approx_weight": tune.uniform(0.001, 1.1),
            "min_at": tune.uniform(0.001, 0.2),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return QLearningContinuous(
            mdp_specs=mdp_specs,
            seed=seed,
            optimization_horizon=optimization_horizon,
            min_at=parameters["min_at"],
            h_weight=parameters["h_weight"],
            span_approx_weight=parameters["span_approx_weight"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        return get_policy_from_q_values(self._mdp_model.Q, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        min_at: float = 0,
        confidence: float = 0.95,
        span_approx_weight: float = 1,
        get_span_approx: Callable[[int, int], float] = None,
        h_weight: float = 1,
        get_H: Callable[[int, int, int, float, float], float] = get_H,
        # Actor parameters
        epsilon_greedy: Union[float, Callable] = None,
        boltzmann_temperature: Union[float, Callable] = None,
    ):
        """

        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        min_at : float
            The minimum value for the alpha coefficient. By default, it is set to zero.
        confidence : float
            One minus the probability of failure. By default, it is set to :math:`0.95`.
        span_approx_weight : float
            The weight given to the value of the span approximation.
        get_span_approx : Callable[[int, int], float]
            The function that computes the value for the span approximation given number of states and number of actions.
            By default, the theoretical value is used.
        h_weight : float
            The weight given to the value of approximate horizon. By default, it is set to one.
        get_H : Callable[[int, int, int, float, float], float]
            The function that computes the approximate horizon given number of states, number of actions, optimization
            horizon, the value of the span approximation, and the confidence. By default, the theoretical value is used.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of the
            total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of
            the total number of interactions. By default, Boltzmann exploration is disabled.
        """

        assert 0 <= min_at < 0.99
        assert 0 < confidence < 1
        assert span_approx_weight > 0
        assert h_weight > 0

        super(QLearningContinuous, self).__init__(
            seed,
            mdp_specs,
            _QValuesModel(
                seed,
                mdp_specs,
                optimization_horizon,
                min_at,
                confidence,
                span_approx_weight,
                get_span_approx,
                h_weight,
                get_H,
            ),
            QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature),
            optimization_horizon,
        )

    def episode_end_update(self):
        pass

    def before_start_interacting(self):
        self._actor.set_q_values(self._mdp_model.Q)

    def step_update(
        self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, h: int
    ):
        super(QLearningContinuous, self).step_update(ts_t, a_t, ts_tp1, h)
        self._actor.set_q_values(self._mdp_model.Q)

    def get_q_value_estimate(self) -> np.ndarray:
        """
        Returns
        -------
        np.ndarray
            The q-values estimate.
        """
        return self._mdp_model.Q
def get_H(n_states, n_actions, T, span_approx, confidence):
Computes the theoretical value of the horizon approximation (the approximate horizon H).
Parameters
- n_states (int): The number of states.
- n_actions (int): The number of actions.
- T (int): The optimization horizon.
- span_approx (float): The span approximation value.
- confidence (float): One minus the probability of failure.
Returns
- float: The theoretical value of the horizon approximation.
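In math form, the value returned by the source above is the following, writing S for n_states, A for n_actions, sp for span_approx, and c for confidence, with the natural logarithm and an exponent of 0.333 approximating 1/3:

H = \min\left( \sqrt{\frac{\mathrm{sp}\, T}{S A}},\; \left(\frac{T}{S A \,\ln(4T/c)}\right)^{0.333} \right)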
@gin.configurable
class QLearningContinuous(BaseAgent):
The q-learning algorithm with optimism.
Wei, Chen-Yu, et al. "Model-free Reinforcement Learning in Infinite-horizon Average-reward Markov Decision Processes." International Conference on Machine Learning. PMLR, 2020.
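For reference, the update implemented in _QValuesModel.step_update (shown in the module source above) can be written as follows, where N(s_t, a_t) is the visit count, H the approximate horizon, \gamma = 1 - 1/H, sp the span approximation, T the optimization horizon, c the confidence parameter, and \alpha_{\min} the min_at parameter (set to zero when min_at is below 0.01):

\begin{aligned}
\alpha_t &= \max\!\Big(\alpha_{\min},\ \tfrac{H+1}{H+N(s_t,a_t)}\Big), \qquad
b_t = 4\,\mathrm{sp}\sqrt{\tfrac{H}{N(s_t,a_t)}\,\ln\tfrac{2T}{c}}, \\
Q_{\mathrm{main}}(s_t,a_t) &\leftarrow (1-\alpha_t)\,Q(s_t,a_t) + \alpha_t\big(r_{t+1} + \gamma\,V(s_{t+1}) + b_t\big), \\
Q(s_t,a_t) &\leftarrow \min\big(Q(s_t,a_t),\ Q_{\mathrm{main}}(s_t,a_t)\big), \qquad
V(s_{t+1}) \leftarrow \max_a Q(s_{t+1},a).
\end{aligned}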
QLearningContinuous( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, min_at: float = 0, confidence: float = 0.95, span_approx_weight: float = 1, get_span_approx: Callable[[int, int], float] = None, h_weight: float = 1, get_H: Callable[[int, int, int, float, float], float] = <function get_H>, epsilon_greedy: Union[float, Callable] = None, boltzmann_temperature: Union[float, Callable] = None)
166 def __init__( 167 self, 168 seed: int, 169 mdp_specs: "MDPSpec", 170 optimization_horizon: int, 171 # MDP model parameters 172 min_at: float = 0, 173 confidence: float = 0.95, 174 span_approx_weight: float = 1, 175 get_span_approx: Callable[[int, int], float] = None, 176 h_weight: float = 1, 177 get_H: Callable[[int, int, int, float, float], float] = get_H, 178 # Actor parameters 179 epsilon_greedy: Union[float, Callable] = None, 180 boltzmann_temperature: Union[float, Callable] = None, 181 ): 182 """ 183 184 Parameters 185 ---------- 186 seed : int 187 The random seed. 188 mdp_specs : MDPSpec 189 The full specification of the MDP. 190 optimization_horizon : int 191 The total number of interactions that the agent is expected to have with the MDP. 192 min_at : float 193 The minimum value for the alpha coefficient. By default, it is set to zero. 194 confidence : float 195 One minus the probability of failure. By default, it is set to :math:`0.95`. 196 span_approx_weight : float 197 The weight given to the value of the span approximation. 198 get_span_approx : Callable[[int, int], float] 199 The function that computes the value for the span approximation given number of states and number of actions. 200 By default, the theoretical value is used. 201 h_weight : float 202 The weight given to the value of approximate horizon. By default, it is set to one. 203 get_H : Callable[[int, int, int, float, float], float] 204 The function that computes the approximate horizon given number of states, number of actions, optimization 205 horizon, the value of the span approximation, and the confidence. By default, the theoretical value is used. 206 epsilon_greedy : Union[float, Callable], optional 207 The probability of selecting an action at random. It can be provided as a float or as a function of the 208 total number of interactions. By default, the probability is set to zero. 209 boltzmann_temperature : Union[float, Callable], optional 210 The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of 211 the total number of interactions. By default, Boltzmann exploration is disabled. 212 """ 213 214 assert 0 <= min_at < 0.99 215 assert 0 < confidence < 1 216 assert span_approx_weight > 0 217 assert h_weight > 0 218 219 super(QLearningContinuous, self).__init__( 220 seed, 221 mdp_specs, 222 _QValuesModel( 223 seed, 224 mdp_specs, 225 optimization_horizon, 226 min_at, 227 confidence, 228 span_approx_weight, 229 get_span_approx, 230 h_weight, 231 get_H, 232 ), 233 QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature), 234 optimization_horizon, 235 )
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- min_at (float): The minimum value for the alpha coefficient. By default, it is set to zero.
- confidence (float): One minus the probability of failure. By default, it is set to 0.95.
- span_approx_weight (float): The weight given to the value of the span approximation.
- get_span_approx (Callable[[int, int], float]): The function that computes the value of the span approximation given the number of states and the number of actions. By default, the theoretical value is used.
- h_weight (float): The weight given to the value of the approximate horizon. By default, it is set to one.
- get_H (Callable[[int, int, int, float, float], float]): The function that computes the approximate horizon given the number of states, the number of actions, the optimization horizon, the value of the span approximation, and the confidence. By default, the theoretical value is used.
- epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
- boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
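A minimal construction sketch based on the signature above. It assumes that an MDPSpec instance mdp_specs has already been obtained for the MDP of interest (how to build one is outside the scope of this module); all other values are illustrative.

from colosseum.agent.agents.infinite_horizon.q_learning import QLearningContinuous

# `mdp_specs` is assumed to be a colosseum.utils.acme.specs.MDPSpec built elsewhere.
agent = QLearningContinuous(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=100_000,  # total number of interactions with the MDP
    min_at=0.01,                   # floor on the learning-rate coefficient alpha_t
    confidence=0.95,               # one minus the probability of failure
    span_approx_weight=1.0,
    h_weight=1.0,
)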
@staticmethod
def is_emission_map_accepted(emission_map: colosseum.emission_maps.base.EmissionMap) -> bool:
Returns
- bool: True if the agent class accepts the emission map.
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
Produces a string containing the gin configuration corresponding to the given parameters.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration file.
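For example, the following snippet (runnable against the source shown at the top of this page) produces one gin binding per parameter, prefixed with the given index:

from colosseum.agent.agents.infinite_horizon.q_learning import QLearningContinuous

params = {"min_at": 0.01, "h_weight": 0.8, "span_approx_weight": 1.0}
print(QLearningContinuous.produce_gin_file_from_parameters(params, index=3))
# prms_3/QLearningContinuous.min_at = 0.01
# prms_3/QLearningContinuous.h_weight = 0.8
# prms_3/QLearningContinuous.span_approx_weight = 1.0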
@staticmethod
def is_episodic() -> bool:
Returns
- bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
- Dict[str, tune.sample.Domain]: The dictionary with key-value pairs mapping hyperparameter names to the corresponding ray.tune samplers.
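As a quick illustration, one can draw a random configuration from these spaces. This assumes the returned values are standard ray.tune domains exposing a sample() method:

from colosseum.agent.agents.infinite_horizon.q_learning import QLearningContinuous

spaces = QLearningContinuous.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in spaces.items()}
print(parameters)  # e.g. {'h_weight': 0.73, 'span_approx_weight': 0.41, 'min_at': 0.08}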
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:
Returns an agent instance for the given MDP specification and agent parameters.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
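A hedged usage sketch combining this factory with a parameters dictionary such as one sampled from the search spaces above; as before, mdp_specs is assumed to be an MDPSpec instance built elsewhere:

from colosseum.agent.agents.infinite_horizon.q_learning import QLearningContinuous

parameters = {"min_at": 0.05, "h_weight": 0.9, "span_approx_weight": 1.0}
# `mdp_specs` is assumed to be a colosseum.utils.acme.specs.MDPSpec built elsewhere.
agent = QLearningContinuous.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_specs,
    parameters=parameters,
)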
current_optimal_stochastic_policy: numpy.ndarray
Returns
- np.ndarray: The estimate of the optimal policy, given the agent's current knowledge, in the form of a distribution over actions.
def episode_end_update(self):
Is called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.
def step_update( self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, h: int):
Adds the given transition to the MDP model.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t (ACTION_TYPE): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- h (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case it refers to the total number of previous interactions.
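A sketch of where step_update sits in a generic interaction loop. The names env, env.reset, env.step, and agent.select_action are assumptions about the surrounding dm_env-style loop and the BaseAgent interface; they are not defined in this file.

# Hypothetical interaction loop; `env` is assumed to be a dm_env.Environment and
# `agent.select_action` is assumed to be provided by BaseAgent (not shown here).
agent.before_start_interacting()
ts_t = env.reset()
for t in range(optimization_horizon):
    a_t = agent.select_action(ts_t, t)       # assumed BaseAgent API
    ts_tp1 = env.step(a_t)
    agent.step_update(ts_t, a_t, ts_tp1, t)  # updates the Q-values model and the actor
    ts_t = ts_tp1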
def get_q_value_estimate(self) -> numpy.ndarray:
Returns
- np.ndarray: The q-values estimate.
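For instance, the stochastic policy returned by current_optimal_stochastic_policy can be recomputed from this estimate with the helper imported at the top of the module; the snippet assumes agent is an already-constructed QLearningContinuous instance:

from colosseum.dynamic_programming.utils import get_policy_from_q_values

Q = agent.get_q_value_estimate()            # shape: (n_states, n_actions)
policy = get_policy_from_q_values(Q, True)  # same call used by the property above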