colosseum.agent.agents.episodic.actor_critic_rnn
from typing import Dict, Any, TYPE_CHECKING

import dm_env
import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.actor_critic_rnn import PolicyValueRNN, ActorCriticRNN
from ray import tune

from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec
    from colosseum.mdp import ACTION_TYPE


@gin.configurable
class ActorCriticRNNEpisodic(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `ActorCriticRNN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/ActorCriticRNNEpisodic.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "max_sequence_length": tune.choice([16, 32, 64, 128]),
            "td_lambda": tune.choice([0.7, 0.8, 0.9]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":

        return ActorCriticRNNEpisodic(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["max_sequence_length"],
            parameters["td_lambda"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        H, S, d = self.emission_map.all_observations.shape

        logits = (
            self._agent._network(
                tf.convert_to_tensor(self.emission_map.all_observations.reshape(-1, d)),
                self._agent._network.initial_state(self._mdp_spec.n_states * H),
            )[0][0]
            .numpy()
            .reshape(H, S, self._mdp_spec.actions.num_values)
        )

        return (logits >= logits.max(-1, keepdims=True)).astype(np.float32)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        max_sequence_length: int,
        td_lambda: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        max_sequence_length : int
            The maximum sequence length for training the agent.
        td_lambda : float
            The TD(:math:`\lambda`) parameter for training the agent.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        network = PolicyValueRNN(
            hidden_sizes=[network_width] * network_depth,
            n_actions=mdp_specs.actions.num_values,
        )
        agent = ActorCriticRNN(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            network=network,
            optimizer=snt.optimizers.Adam(learning_rate=3e-3),
            max_sequence_length=max_sequence_length,
            td_lambda=td_lambda,
            discount=0.99,
            seed=seed,
        )
        super(ActorCriticRNNEpisodic, self).__init__(seed, agent, mdp_specs)

    def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE":
        action = super(ActorCriticRNNEpisodic, self).select_action(ts, time)
        if action >= self._mdp_spec.actions.num_values:
            return self._rng.randint(self._mdp_spec.actions.num_values)
        return action
@gin.configurable
class ActorCriticRNNEpisodic(NonTabularBsuiteAgentWrapper)
The wrapper for the `ActorCriticRNN` agent from `bsuite`.
ActorCriticRNNEpisodic(seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, max_sequence_length: int, td_lambda: float)
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- network_width (int): The width of the neural networks of the agent.
- network_depth (int): The depth of the neural networks of the agent.
- max_sequence_length (int): The maximum sequence length for training the agent.
- td_lambda (float): The TD(\( \lambda \)) parameter for training the agent.
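A minimal instantiation sketch follows; `mdp_specs` is assumed to be an already available `MDPSpec` for an episodic Colosseum MDP (how the specification is produced is not shown here), and the hyperparameter values are illustrative only.

agent = ActorCriticRNNEpisodic(
    seed=42,
    mdp_specs=mdp_specs,          # assumed available colosseum.utils.acme.specs.MDPSpec
    optimization_horizon=50_000,  # total number of interactions with the MDP
    network_width=128,
    network_depth=2,
    max_sequence_length=32,
    td_lambda=0.9,
)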
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
Produces a string containing the gin configuration corresponding to the given parameters.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration file.
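As a quick illustration of the output format implied by the source above (parameter names and values are arbitrary examples):

params = {"network_width": 128, "network_depth": 2, "max_sequence_length": 32, "td_lambda": 0.9}
print(ActorCriticRNNEpisodic.produce_gin_file_from_parameters(params, index=3))
# prms_3/ActorCriticRNNEpisodic.network_width = 128
# prms_3/ActorCriticRNNEpisodic.network_depth = 2
# prms_3/ActorCriticRNNEpisodic.max_sequence_length = 32
# prms_3/ActorCriticRNNEpisodic.td_lambda = 0.9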
@staticmethod
def is_episodic() -> bool:
Returns
- bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
- Dict[str, tune.sample.Domain]: The dictionary mapping each hyperparameter name to the corresponding ray.tune sampler.
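A sketch of drawing a single candidate configuration from these spaces; Ray Tune Domain objects expose a sample() method in recent releases, though the exact API may vary across Ray versions:

spaces = ActorCriticRNNEpisodic.get_hyperparameters_search_spaces()
# Sample each hyperparameter independently to obtain one candidate configuration.
candidate = {name: domain.sample() for name, domain in spaces.items()}
# The sampled dictionary can then be turned into a gin configuration string.
gin_config = ActorCriticRNNEpisodic.produce_gin_file_from_parameters(candidate)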
@staticmethod
def get_agent_instance_from_parameters(seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:
Returns an agent instance for the given MDP specification and agent parameters.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
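A short sketch combining the hyperparameter search spaces with this factory method; `mdp_specs` is again an assumed-available MDPSpec instance:

spaces = ActorCriticRNNEpisodic.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in spaces.items()}
agent = ActorCriticRNNEpisodic.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_specs,  # assumed available MDPSpec
    parameters=parameters,
)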
current_optimal_stochastic_policy: numpy.ndarray
Returns
- np.ndarray: The estimate of the optimal policy given the agent's current knowledge, in the form of a distribution over actions.
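Based on the source above, the returned array has shape (H, S, A), where H is the episode horizon, S the number of states, and A the number of actions; each (h, s) slice marks the greedy action(s) under the agent's policy network. A small sketch of reading the greedy action for a hypothetical in-episode time and state index:

policy = agent.current_optimal_stochastic_policy  # shape (H, S, A)
h, s = 0, 3  # hypothetical in-episode time and state index
greedy_action = int(policy[h, s].argmax())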
def select_action(self, ts: dm_env._environment.TimeStep, time: int) -> Union[int, float, numpy.ndarray]:
Parameters
- ts (dm_env.TimeStep): The TimeStep for which the agent is required to calculate the next action.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
- action (ACTION_TYPE): The action that the agent suggests to take given the observation and the time step.
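A minimal interaction sketch showing how select_action is called; `env` is assumed to be a dm_env-compatible episodic environment and `agent` an ActorCriticRNNEpisodic instance. The full Colosseum loop also feeds transitions back to the agent for training, which is omitted here.

ts = env.reset()
time = 0
while not ts.last():
    action = agent.select_action(ts, time)  # in-episode time is passed explicitly
    ts = env.step(action)
    time += 1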