colosseum.agent.agents.infinite_horizon.actor_critic_rnn
```python
from typing import Dict, Any, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.actor_critic_rnn import PolicyValueRNN, ActorCriticRNN
from ray import tune

from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class ActorCriticRNNContinuous(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `ActorCriticRNN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/ActorCriticRNNContinuous.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return False

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "max_sequence_length": tune.choice([16, 32, 64, 128]),
            "td_lambda": tune.choice([0.7, 0.8, 0.9]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return ActorCriticRNNContinuous(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["max_sequence_length"],
            parameters["td_lambda"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        logits = tf.stop_gradient(
            self._agent._network(
                tf.convert_to_tensor(self.emission_map.all_observations),
                self._agent._network.initial_state(self._mdp_spec.n_states),
            )[0][0]
        ).numpy()

        return (logits >= logits.max(-1, keepdims=True)).astype(np.float32)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        max_sequence_length: int,
        td_lambda: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        max_sequence_length : int
            The maximum sequence length for training the agent.
        td_lambda : float
            The TD(:math:`\lambda`) parameter for training the agent.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        network = PolicyValueRNN(
            hidden_sizes=[network_width] * network_depth,
            n_actions=mdp_specs.actions.num_values,
        )
        agent = ActorCriticRNN(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            network=network,
            optimizer=snt.optimizers.Adam(learning_rate=3e-3),
            max_sequence_length=max_sequence_length,
            td_lambda=td_lambda,
            discount=0.99,
            seed=seed,
        )
        super(ActorCriticRNNContinuous, self).__init__(seed, agent, mdp_specs)
```
@gin.configurable
class ActorCriticRNNContinuous(NonTabularBsuiteAgentWrapper)
The wrapper for the ActorCriticRNN agent from bsuite.
ActorCriticRNNContinuous( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, max_sequence_length: int, td_lambda: float)
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- network_width (int): The width of the neural networks of the agent.
- network_depth (int): The depth of the neural networks of the agent.
- max_sequence_length (int): The maximum sequence length for training the agent.
- td_lambda (float): The TD(λ) parameter for training the agent.
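A minimal construction sketch (the hyperparameter values are illustrative, and mdp_specs is assumed to be an already-built MDPSpec for the target MDP):

```python
from colosseum.agent.agents.infinite_horizon.actor_critic_rnn import (
    ActorCriticRNNContinuous,
)

# `mdp_specs` is assumed to be a colosseum.utils.acme.specs.MDPSpec describing
# the target MDP; how it is obtained is outside the scope of this sketch.
agent = ActorCriticRNNContinuous(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=100_000,
    network_width=128,
    network_depth=2,
    max_sequence_length=32,
    td_lambda=0.8,
)
```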
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
Produces a string containing the gin configuration corresponding to the parameters provided as input.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration as a string.
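For illustration, each parameter produces one binding scoped under prms_{index}; a usage sketch with example values:

```python
parameters = {
    "network_width": 128,
    "network_depth": 2,
    "max_sequence_length": 32,
    "td_lambda": 0.8,
}
print(ActorCriticRNNContinuous.produce_gin_file_from_parameters(parameters, index=0))
# prms_0/ActorCriticRNNContinuous.network_width = 128
# prms_0/ActorCriticRNNContinuous.network_depth = 2
# prms_0/ActorCriticRNNContinuous.max_sequence_length = 32
# prms_0/ActorCriticRNNContinuous.td_lambda = 0.8
```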
@staticmethod
def is_episodic() -> bool:
Returns
- bool: True if the agent is suited for the episodic setting; this infinite-horizon variant returns False.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
- Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding ray.tune samplers.
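As a sketch, the returned domains can be sampled directly with ray.tune's Domain.sample(); in a full hyperparameter search they would instead be handed to the Ray Tune tuning loop. The printed values are only an example draw:

```python
search_spaces = ActorCriticRNNContinuous.get_hyperparameters_search_spaces()
# Each value is a ray.tune domain; .sample() draws one candidate configuration.
parameters = {name: domain.sample() for name, domain in search_spaces.items()}
print(parameters)
# e.g. {'network_width': 256, 'network_depth': 2,
#       'max_sequence_length': 64, 'td_lambda': 0.9}
```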
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:
Returns an agent instance for the MDP specification and agent parameters provided as input.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
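A usage sketch that builds the agent from a fixed parameter dictionary; the values are illustrative, the optimization horizon is arbitrary, and mdp_specs is assumed to be an MDPSpec built elsewhere:

```python
parameters = {
    "network_width": 64,
    "network_depth": 4,
    "max_sequence_length": 16,
    "td_lambda": 0.7,
}
agent = ActorCriticRNNContinuous.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_specs,  # assumed MDPSpec for the target MDP
    parameters=parameters,
)
```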