colosseum.agent.agents.episodic.actor_critic
```python
from typing import Dict, Any, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.actor_critic import PolicyValueNet, ActorCritic
from ray import tune

from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class ActorCriticEpisodic(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `ActorCritic` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/ActorCriticEpisodic.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "max_sequence_length": tune.choice([16, 32, 64, 128]),
            "td_lambda": tune.choice([0.7, 0.8, 0.9]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return ActorCriticEpisodic(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["max_sequence_length"],
            parameters["td_lambda"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        H, S, d = self.emission_map.all_observations.shape
        logits = (
            tf.stop_gradient(
                self._agent._network(
                    tf.convert_to_tensor(
                        self.emission_map.all_observations.reshape(-1, d)
                    )
                )[0].logits
            )
            .numpy()
            .reshape(H, S, self._mdp_spec.actions.num_values)
        )
        return (logits >= logits.max(-1, keepdims=True)).astype(np.float32)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        max_sequence_length: int,
        td_lambda: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        max_sequence_length : int
            The maximum sequence length for training the agent.
        td_lambda : float
            The TD(:math:`\lambda`) parameter for training the agent.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        network = PolicyValueNet(
            hidden_sizes=[network_width] * network_depth,
            action_spec=mdp_specs.actions,
        )
        agent = ActorCritic(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            network=network,
            optimizer=snt.optimizers.Adam(learning_rate=3e-3),
            max_sequence_length=max_sequence_length,
            td_lambda=td_lambda,
            discount=0.99,
            seed=seed,
        )
        super(ActorCriticEpisodic, self).__init__(seed, agent, mdp_specs)
```
```python
@gin.configurable
class ActorCriticEpisodic(NonTabularBsuiteAgentWrapper):
```
The wrapper for the `ActorCritic` agent from `bsuite`.
```python
ActorCriticEpisodic(
    seed: int,
    mdp_specs: colosseum.utils.acme.specs.MDPSpec,
    optimization_horizon: int,
    network_width: int,
    network_depth: int,
    max_sequence_length: int,
    td_lambda: float,
)
```
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- network_width (int): The width of the neural networks of the agent.
- network_depth (int): The depth of the neural networks of the agent.
- max_sequence_length (int): The maximum sequence length for training the agent.
- td_lambda (float): The TD(λ) parameter for training the agent.
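As a minimal usage sketch, the wrapper can be constructed directly from these parameters. Here `mdp_specs` is assumed to be an `MDPSpec` obtained elsewhere (e.g. from a Colosseum MDP instance), and the hyperparameter values are illustrative only.

```python
# Minimal sketch: direct construction of the wrapper. `mdp_specs` is an
# assumed, pre-existing MDPSpec; it is not built here.
agent = ActorCriticEpisodic(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=10_000,
    network_width=64,        # hidden layer size of the policy-value network
    network_depth=2,         # number of hidden layers
    max_sequence_length=32,  # training sequence length
    td_lambda=0.8,           # TD(lambda) coefficient
)
```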
```python
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0)
```
Produces a string containing the gin config file corresponding to the given parameters.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The contents of the gin configuration file.
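For instance, given a parameter dictionary, the method emits one gin binding per entry (the values below are illustrative):

```python
# Each (key, value) pair becomes a "prms_{index}/ActorCriticEpisodic.{key} = {value}" line.
params = {"network_width": 64, "td_lambda": 0.8}
print(ActorCriticEpisodic.produce_gin_file_from_parameters(params, index=0))
# prms_0/ActorCriticEpisodic.network_width = 64
# prms_0/ActorCriticEpisodic.td_lambda = 0.8
```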
```python
@staticmethod
def is_episodic() -> bool
```
Returns
- bool: True if the agent is suited for the episodic setting.
```python
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]
```
Returns
- Dict[str, tune.sample.Domain]: A dictionary mapping hyperparameter names to the corresponding `ray.tune` samplers.
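A short sketch of how such a search space might be consumed; it assumes the `Domain.sample()` helper exposed by the Ray Tune versions this API targets:

```python
# Sketch (assumes ray.tune Domain objects expose .sample()): draw one
# hyperparameter configuration from the search space above.
search_spaces = ActorCriticEpisodic.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in search_spaces.items()}
# e.g. {'network_width': 128, 'network_depth': 2,
#       'max_sequence_length': 64, 'td_lambda': 0.9}
```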
```python
@staticmethod
def get_agent_instance_from_parameters(
    seed: int,
    optimization_horizon: int,
    mdp_specs: colosseum.utils.acme.specs.MDPSpec,
    parameters: Dict[str, Any],
) -> colosseum.agent.agents.base.BaseAgent
```
Returns an agent instance for the given MDP specification and agent parameters.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
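Putting the pieces together, a hedged end-to-end sketch (again assuming a pre-existing `mdp_specs` and the `Domain.sample()` helper):

```python
# End-to-end sketch: sample a configuration and instantiate the agent.
# `mdp_specs` is an assumed, pre-existing MDPSpec.
search_spaces = ActorCriticEpisodic.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in search_spaces.items()}
agent = ActorCriticEpisodic.get_agent_instance_from_parameters(
    seed=42,
    optimization_horizon=10_000,
    mdp_specs=mdp_specs,
    parameters=parameters,
)
```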