colosseum.agent.agents.base
```python
import abc
import random
from typing import TYPE_CHECKING, Any, Dict, Union

import dm_env
import numpy as np
from ray import tune

from colosseum.emission_maps import EmissionMap
from colosseum.utils.acme.specs import MDPSpec

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE
    from colosseum.agent.actors import ACTOR_TYPES
    from colosseum.agent.mdp_models import MODEL_TYPES


class BaseAgent(abc.ABC):
    """
    The base class for Colosseum agents.
    """

    @staticmethod
    @abc.abstractmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        """
        Returns
        -------
        bool
            True if the agent class accepts the emission map.
        """

    @staticmethod
    @abc.abstractmethod
    def is_episodic() -> bool:
        """
        Returns
        -------
        bool
            True if the agent is suited for the episodic setting.
        """

    @staticmethod
    @abc.abstractmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        """
        Returns
        -------
        Dict[str, tune.sample.Domain]
            The dictionary with key-value pairs corresponding to hyperparameter names and the
            corresponding `ray.tune` samplers.
        """

    @staticmethod
    @abc.abstractmethod
    def produce_gin_file_from_parameters(
        parameters: Dict[str, Any], index: int = 0
    ) -> str:
        """
        produces a string containing the gin config file corresponding to the parameters given in input.

        Parameters
        ----------
        parameters : Dict[str, Any]
            The dictionary containing the parameters of the agent.
        index : int
            The index assigned to the gin configuration.

        Returns
        -------
        gin_config : str
            The gin configuration file.
        """

    @staticmethod
    @abc.abstractmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: MDPSpec,
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        """
        returns an agent instance for the MDP specification and agent parameters given in input.

        Parameters
        ----------
        seed : int
            The random seed.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        parameters : Dict[str, Any]
            The dictionary containing the parameters of the agent.

        Returns
        -------
        BaseAgent
            The agent instance.
        """

    @abc.abstractmethod
    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        mdp_model: Union[None, "MODEL_TYPES"],
        actor: "ACTOR_TYPES",
        optimization_horizon: int,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        mdp_model : BaseMDPModel
            The component of the agent that contains the knowledge acquired from the interactions
            with the MDP.
        actor : BaseActor
            The component of the agent that provides a mapping from MDP estimates to actions.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        """
        self._mdp_spec = mdp_specs
        self._mdp_model = mdp_model
        self._actor = actor
        self._optimization_horizon = optimization_horizon
        self._time_horizon = mdp_specs.time_horizon

        self._rng = np.random.RandomState(seed)
        self._rng_fast = random.Random(seed)

    @property
    @abc.abstractmethod
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        """
        Returns
        -------
        np.ndarray
            The estimate of the optimal policy given the current knowledge of the agent, in the
            form of a distribution over actions.
        """

    @abc.abstractmethod
    def episode_end_update(self):
        """
        is called when an episode ends. In the infinite horizon case, we refer to artificial episodes.
        """

    @abc.abstractmethod
    def before_start_interacting(self):
        """
        is called before the agent starts interacting with the MDP.
        """

    def is_episode_end(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ) -> bool:
        """
        checks whether the episode has terminated. By default, this checks whether the latest time
        step is the last of the episode. In the continuous case, this can be used to define
        artificial episodes.

        Parameters
        ----------
        ts_t : dm_env.TimeStep
            The TimeStep at time t.
        a_t : "ACTION_TYPE"
            The action taken by the agent at time t.
        ts_tp1 : dm_env.TimeStep
            The TimeStep at time t + 1.
        time : int
            The current time of the environment. In the episodic case, this refers to the
            in-episode time, whereas in the continuous case this refers to the total number of
            previous interactions.

        Returns
        -------
        bool
            True if the episode terminated at time t + 1.
        """
        return ts_tp1.last()

    def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE":
        """
        Parameters
        ----------
        ts : dm_env.TimeStep
            The TimeStep for which the agent is required to calculate the next action.
        time : int
            The current time of the environment. In the episodic case, this refers to the
            in-episode time, whereas in the continuous case this refers to the total number of
            previous interactions.

        Returns
        -------
        action : ACTION_TYPE
            The action that the agent suggests to take given the observation and the time step.
        """
        return self._actor.select_action(ts, time)

    @abc.abstractmethod
    def step_update(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ):
        """
        adds the transition in input to the MDP model.

        Parameters
        ----------
        ts_t : dm_env.TimeStep
            The TimeStep at time t.
        a_t : "ACTION_TYPE"
            The action taken by the agent at time t.
        ts_tp1 : dm_env.TimeStep
            The TimeStep at time t + 1.
        time : int
            The current time of the environment. In the episodic case, this refers to the
            in-episode time, whereas in the continuous case this refers to the total number of
            previous interactions.
        """
        if self._mdp_model:
            self._mdp_model.step_update(ts_t, a_t, ts_tp1, time)

    def agent_logs(self):
        """
        is called during the agent/MDP interaction at logging time. It can be used to log
        additional information.
        """
```
class BaseAgent(abc.ABC)
The base class for Colosseum agents.
@abc.abstractmethod
BaseAgent(seed: int, mdp_specs: "MDPSpec", mdp_model: Union[None, "MODEL_TYPES"], actor: "ACTOR_TYPES", optimization_horizon: int)
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- mdp_model (BaseMDPModel): The component of the agent that contains the knowledge acquired from the interactions with the MDP.
- actor (BaseActor): The component of the agent that provides a mapping from MDP estimates to actions.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
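As an illustration, here is a hedged sketch of how a concrete subclass might build its actor component and forward everything to this constructor. The `_RandomActor` helper and the `mdp_specs.actions.num_values` field (as in acme's DiscreteArray) are assumptions for a discrete-action setting, not part of the documented contract.

```python
import dm_env
import numpy as np

from colosseum.agent.agents.base import BaseAgent


class _RandomActor:
    """Hypothetical stand-in for an ACTOR_TYPES component."""

    def __init__(self, seed: int, num_actions: int):
        self._rng = np.random.RandomState(seed)
        self._num_actions = num_actions

    def select_action(self, ts: dm_env.TimeStep, time: int) -> int:
        return int(self._rng.randint(self._num_actions))


class MyAgent(BaseAgent):  # hypothetical agent; the other abstract methods are omitted
    def __init__(self, seed: int, mdp_specs, optimization_horizon: int):
        # Assumes a discrete action spec exposing `num_values`.
        actor = _RandomActor(seed, mdp_specs.actions.num_values)
        super().__init__(
            seed=seed,
            mdp_specs=mdp_specs,
            mdp_model=None,  # a model-free agent keeps no MDP model component
            actor=actor,
            optimization_horizon=optimization_horizon,
        )
        # After the super() call, self._rng and self._rng_fast are seeded, and
        # self._time_horizon caches mdp_specs.time_horizon.
```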
@staticmethod
@abc.abstractmethod
def is_emission_map_accepted(emission_map: "EmissionMap") -> bool
Returns
- bool: True if the agent class accepts the emission map.
@staticmethod
@abc.abstractmethod
def is_episodic() -> bool
Returns
- bool: True if the agent is suited for the episodic setting.
@staticmethod
@abc.abstractmethod
def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]
Returns
- Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding `ray.tune` samplers.
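A hypothetical implementation for an agent with two hyperparameters; the names `lr` and `epsilon` are illustrative, not part of the Colosseum API.

```python
from typing import Dict

from ray import tune

from colosseum.agent.agents.base import BaseAgent


class MyAgent(BaseAgent):  # hypothetical agent; the other abstract methods are omitted
    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "lr": tune.loguniform(1e-4, 1e-1),   # learning rate, sampled on a log scale
            "epsilon": tune.uniform(0.01, 0.3),  # exploration rate, sampled uniformly
        }
```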
@staticmethod
@abc.abstractmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0) -> str
Produces a string containing the gin configuration file corresponding to the given parameters.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration file.
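A hedged sketch, assuming the parameters are bound with gin's scoped-binding syntax (`scope/Class.param = value`); the `prms_{index}` scope name and the `MyAgent` class name are illustrative.

```python
from typing import Any, Dict

from colosseum.agent.agents.base import BaseAgent


class MyAgent(BaseAgent):  # hypothetical agent; the other abstract methods are omitted
    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0) -> str:
        # One gin binding per parameter, scoped by the configuration index.
        return "\n".join(
            f"prms_{index}/MyAgent.{name} = {value}"
            for name, value in parameters.items()
        )
```

With this sketch, `produce_gin_file_from_parameters({"lr": 0.05}, index=3)` would return the single line `prms_3/MyAgent.lr = 0.05`.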
@staticmethod
@abc.abstractmethod
def get_agent_instance_from_parameters(seed: int, optimization_horizon: int, mdp_specs: MDPSpec, parameters: Dict[str, Any]) -> "BaseAgent"
Returns an agent instance for the given MDP specification and agent parameters.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
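Putting the static methods together, a hypothetical end-to-end usage samples a configuration from the agent's search space and instantiates the agent for a given MDP specification; `MyAgent` and `mdp_specs` are assumed to exist.

```python
# Sample one value from each hyperparameter domain; in a full experiment,
# ray.tune would typically drive the sampling instead.
search_space = MyAgent.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in search_space.items()}

agent = MyAgent.get_agent_instance_from_parameters(
    seed=42,
    optimization_horizon=100_000,
    mdp_specs=mdp_specs,
    parameters=parameters,
)
```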
@property
@abc.abstractmethod
current_optimal_stochastic_policy: np.ndarray
Returns
- np.ndarray: The agent's current estimate of the optimal policy, expressed as a distribution over actions.
@abc.abstractmethod
def episode_end_update(self)
Called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.
@abc.abstractmethod
def before_start_interacting(self)
Called before the agent starts interacting with the MDP.
def is_episode_end(self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, time: int) -> bool
Checks whether the episode has terminated. By default, this returns whether `ts_tp1` is the last time step of the episode, which in the episodic setting happens when the time horizon is reached. In the continuous case, this can be overridden to define artificial episodes.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t ("ACTION_TYPE"): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
- bool: True if the episode terminated at time t+1.
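In the continuous setting, a subclass can override this hook to declare artificial episodes. Below is a hedged sketch that ends an artificial episode after every fixed number of interactions; the `_artificial_episode_length` attribute is an illustrative assumption.

```python
import dm_env

from colosseum.agent.agents.base import BaseAgent


class MyContinuousAgent(BaseAgent):  # hypothetical agent; the other abstract methods are omitted
    def is_episode_end(
        self, ts_t: dm_env.TimeStep, a_t, ts_tp1: dm_env.TimeStep, time: int
    ) -> bool:
        # Declare an artificial episode end every _artificial_episode_length steps.
        return (time + 1) % self._artificial_episode_length == 0
```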
def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE"
Parameters
- ts (dm_env.TimeStep): The TimeStep for which the agent is required to calculate the next action.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
- action (ACTION_TYPE): The action that the agent suggests to take given the observation and the time step.
@abc.abstractmethod
def step_update(self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, time: int)
Adds the given transition to the MDP model.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t ("ACTION_TYPE"): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
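A hedged sketch of a concrete override: the transition is first forwarded to the MDP model through the base implementation, then used for a tabular Q-learning update. The `_q_values`, `_lr`, and `_gamma` attributes, and the assumption that observations are state indices, are illustrative.

```python
import dm_env

from colosseum.agent.agents.base import BaseAgent


class MyQLearningAgent(BaseAgent):  # hypothetical agent; the other abstract methods are omitted
    def step_update(
        self, ts_t: dm_env.TimeStep, a_t, ts_tp1: dm_env.TimeStep, time: int
    ):
        # Let the base class pass the transition to the MDP model, if one is present.
        super().step_update(ts_t, a_t, ts_tp1, time)

        s, s_next = ts_t.observation, ts_tp1.observation  # assumes tabular observations
        bootstrap = 0.0 if ts_tp1.last() else self._gamma * self._q_values[s_next].max()
        td_error = ts_tp1.reward + bootstrap - self._q_values[s, a_t]
        self._q_values[s, a_t] += self._lr * td_error
```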