colosseum.agent.agents.episodic.boot_dqn
from typing import Any, Dict, Callable, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.boot_dqn import BootstrappedDqn, make_ensemble
from ray import tune

from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class BootDQNEpisodic(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `BootDQN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        """
        Produces the gin configuration corresponding to the given parameters.

        Parameters
        ----------
        parameters : Dict[str, Any]
            The dictionary containing the parameters of the agent.
        index : int
            The index assigned to the gin configuration. By default, it is set to zero.

        Returns
        -------
        str
            One ``prms_<index>/BootDQNEpisodic.<name> = <value>`` binding per parameter, separated by newlines
            and without a trailing newline. It is the empty string when no parameter is given.
        """
        # Joining the bindings avoids building the string incrementally and
        # then slicing off the trailing newline.
        return "\n".join(
            f"prms_{index}/BootDQNEpisodic.{k} = {v}" for k, v in parameters.items()
        )

    @staticmethod
    def is_episodic() -> bool:
        """
        Returns
        -------
        bool
            Always True for this agent.
        """
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        """
        Returns
        -------
        Dict[str, tune.sample.Domain]
            The dictionary mapping each hyperparameter name to the `ray.tune` categorical sampler over its
            candidate values.
        """
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "batch_size": tune.choice([32, 64, 128]),
            "sgd_period": tune.choice([1, 4, 8]),
            "target_update_period": tune.choice([4, 16, 32]),
            "mask_prob": tune.choice([0.8, 0.9, 1.0]),
            "noise_scale": tune.choice([0.0, 0.05, 0.1]),
            "n_ensemble": tune.choice([8, 16, 20]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        """
        Returns an agent instance for the MDP specification and agent parameters given in input.

        Parameters
        ----------
        seed : int
            The random seed.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        parameters : Dict[str, Any]
            The dictionary containing the parameters of the agent. It must contain the keys `network_width`,
            `network_depth`, `batch_size`, `sgd_period`, `target_update_period`, `mask_prob`, `noise_scale`,
            and `n_ensemble`.

        Returns
        -------
        BaseAgent
            The agent instance.

        Raises
        ------
        KeyError
            If one of the required keys is missing from `parameters`.
        """
        # Keyword arguments keep this call correct even if the constructor's
        # parameter order changes (note that the order of `mdp_specs` and
        # `optimization_horizon` differs between the two signatures).
        return BootDQNEpisodic(
            seed=seed,
            mdp_specs=mdp_specs,
            optimization_horizon=optimization_horizon,
            network_width=parameters["network_width"],
            network_depth=parameters["network_depth"],
            batch_size=parameters["batch_size"],
            sgd_period=parameters["sgd_period"],
            target_update_period=parameters["target_update_period"],
            mask_prob=parameters["mask_prob"],
            noise_scale=parameters["noise_scale"],
            n_ensemble=parameters["n_ensemble"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        """
        Returns
        -------
        np.ndarray
            The policy obtained from the Q-value estimates that the currently active ensemble head produces
            for all the observations of the emission map.
        """
        all_observations = self.emission_map.all_observations
        H, S, d = all_observations.shape
        active_head = self._agent._forward[self._agent._active_head]

        # Evaluate all observations as a single flat batch, detached from the
        # gradient computation, and restore the leading (H, S) axes afterwards.
        flat_q_values = tf.stop_gradient(active_head(all_observations.reshape(-1, d)))
        q_values = flat_q_values.numpy().reshape(
            H, S, self._mdp_spec.actions.num_values
        )
        return get_policy_from_q_values(q_values, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # Network and training parameters
        network_width: int,
        network_depth: int,
        batch_size: int,
        sgd_period: int,
        target_update_period: int,
        # Bootstrapping parameters
        mask_prob: float,
        noise_scale: float,
        n_ensemble: int,
        learning_rate: float = 1e-3,
        replay_capacity: int = 10000,
        epsilon_fn: Callable[[int], float] = lambda t: 0,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP. It is not used
            by this constructor.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        batch_size : int
            The batch size for training the agent. It is also used as the minimum replay size.
        sgd_period : int
            The stochastic gradient descent update period.
        target_update_period : int
            The interval length between updating the target network.
        mask_prob : float
            The masking probability for the bootstrapping procedure.
        noise_scale : float
            The scale of the Gaussian noise added to the value estimates.
        n_ensemble : int
            The number of networks in the ensemble.
        learning_rate : float
            The learning rate of the optimizer. By default, it is set to 1e-3.
        replay_capacity : int
            The maximum capacity of the replay buffer. By default, it is set to 10 000.
        epsilon_fn : Callable[[int], float]
            The :math:`\epsilon`-greedy probability as a function of the time. By default, it is set to zero.
        """

        # Seed the global TensorFlow and NumPy random generators.
        tf.random.set_seed(seed)
        np.random.seed(seed)

        ensemble = make_ensemble(
            mdp_specs.actions.num_values, n_ensemble, network_depth, network_width
        )
        optimizer = snt.optimizers.Adam(learning_rate=learning_rate)

        agent = BootstrappedDqn(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            ensemble=ensemble,
            batch_size=batch_size,
            discount=1,  # rewards are not discounted
            replay_capacity=replay_capacity,
            min_replay_size=batch_size,
            sgd_period=sgd_period,
            target_update_period=target_update_period,
            optimizer=optimizer,
            mask_prob=mask_prob,
            noise_scale=noise_scale,
            seed=seed,
            epsilon_fn=epsilon_fn,
        )
        super().__init__(seed, agent, mdp_specs)
@gin.configurable
class BootDQNEpisodic(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `BootDQN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        """
        Produces the gin configuration corresponding to the given parameters.

        Parameters
        ----------
        parameters : Dict[str, Any]
            The dictionary containing the parameters of the agent.
        index : int
            The index assigned to the gin configuration. By default, it is set to zero.

        Returns
        -------
        str
            One ``prms_<index>/BootDQNEpisodic.<name> = <value>`` binding per parameter, separated by newlines
            and without a trailing newline. It is the empty string when no parameter is given.
        """
        # Joining the bindings avoids building the string incrementally and
        # then slicing off the trailing newline.
        return "\n".join(
            f"prms_{index}/BootDQNEpisodic.{k} = {v}" for k, v in parameters.items()
        )

    @staticmethod
    def is_episodic() -> bool:
        """
        Returns
        -------
        bool
            Always True for this agent.
        """
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        """
        Returns
        -------
        Dict[str, tune.sample.Domain]
            The dictionary mapping each hyperparameter name to the `ray.tune` categorical sampler over its
            candidate values.
        """
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "batch_size": tune.choice([32, 64, 128]),
            "sgd_period": tune.choice([1, 4, 8]),
            "target_update_period": tune.choice([4, 16, 32]),
            "mask_prob": tune.choice([0.8, 0.9, 1.0]),
            "noise_scale": tune.choice([0.0, 0.05, 0.1]),
            "n_ensemble": tune.choice([8, 16, 20]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        """
        Returns an agent instance for the MDP specification and agent parameters given in input.

        Parameters
        ----------
        seed : int
            The random seed.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        parameters : Dict[str, Any]
            The dictionary containing the parameters of the agent. It must contain the keys `network_width`,
            `network_depth`, `batch_size`, `sgd_period`, `target_update_period`, `mask_prob`, `noise_scale`,
            and `n_ensemble`.

        Returns
        -------
        BaseAgent
            The agent instance.

        Raises
        ------
        KeyError
            If one of the required keys is missing from `parameters`.
        """
        # Keyword arguments keep this call correct even if the constructor's
        # parameter order changes (note that the order of `mdp_specs` and
        # `optimization_horizon` differs between the two signatures).
        return BootDQNEpisodic(
            seed=seed,
            mdp_specs=mdp_specs,
            optimization_horizon=optimization_horizon,
            network_width=parameters["network_width"],
            network_depth=parameters["network_depth"],
            batch_size=parameters["batch_size"],
            sgd_period=parameters["sgd_period"],
            target_update_period=parameters["target_update_period"],
            mask_prob=parameters["mask_prob"],
            noise_scale=parameters["noise_scale"],
            n_ensemble=parameters["n_ensemble"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        """
        Returns
        -------
        np.ndarray
            The policy obtained from the Q-value estimates that the currently active ensemble head produces
            for all the observations of the emission map.
        """
        all_observations = self.emission_map.all_observations
        H, S, d = all_observations.shape
        active_head = self._agent._forward[self._agent._active_head]

        # Evaluate all observations as a single flat batch, detached from the
        # gradient computation, and restore the leading (H, S) axes afterwards.
        flat_q_values = tf.stop_gradient(active_head(all_observations.reshape(-1, d)))
        q_values = flat_q_values.numpy().reshape(
            H, S, self._mdp_spec.actions.num_values
        )
        return get_policy_from_q_values(q_values, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # Network and training parameters
        network_width: int,
        network_depth: int,
        batch_size: int,
        sgd_period: int,
        target_update_period: int,
        # Bootstrapping parameters
        mask_prob: float,
        noise_scale: float,
        n_ensemble: int,
        learning_rate: float = 1e-3,
        replay_capacity: int = 10000,
        epsilon_fn: Callable[[int], float] = lambda t: 0,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP. It is not used
            by this constructor.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        batch_size : int
            The batch size for training the agent. It is also used as the minimum replay size.
        sgd_period : int
            The stochastic gradient descent update period.
        target_update_period : int
            The interval length between updating the target network.
        mask_prob : float
            The masking probability for the bootstrapping procedure.
        noise_scale : float
            The scale of the Gaussian noise added to the value estimates.
        n_ensemble : int
            The number of networks in the ensemble.
        learning_rate : float
            The learning rate of the optimizer. By default, it is set to 1e-3.
        replay_capacity : int
            The maximum capacity of the replay buffer. By default, it is set to 10 000.
        epsilon_fn : Callable[[int], float]
            The :math:`\epsilon`-greedy probability as a function of the time. By default, it is set to zero.
        """

        # Seed the global TensorFlow and NumPy random generators.
        tf.random.set_seed(seed)
        np.random.seed(seed)

        ensemble = make_ensemble(
            mdp_specs.actions.num_values, n_ensemble, network_depth, network_width
        )
        optimizer = snt.optimizers.Adam(learning_rate=learning_rate)

        agent = BootstrappedDqn(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            ensemble=ensemble,
            batch_size=batch_size,
            discount=1,  # rewards are not discounted
            replay_capacity=replay_capacity,
            min_replay_size=batch_size,
            sgd_period=sgd_period,
            target_update_period=target_update_period,
            optimizer=optimizer,
            mask_prob=mask_prob,
            noise_scale=noise_scale,
            seed=seed,
            epsilon_fn=epsilon_fn,
        )
        super().__init__(seed, agent, mdp_specs)
The wrapper for the `BootDQN` agent from `bsuite`.
BootDQNEpisodic( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, batch_size: int, sgd_period: int, target_update_period: int, mask_prob: float, noise_scale: float, n_ensemble: int, learning_rate: float = 0.001, replay_capacity: int = 10000, epsilon_fn: Callable[[int], float] = <function BootDQNEpisodic.<lambda>>)
def __init__(
    self,
    seed: int,
    mdp_specs: "MDPSpec",
    optimization_horizon: int,
    # Network and training parameters
    network_width: int,
    network_depth: int,
    batch_size: int,
    sgd_period: int,
    target_update_period: int,
    # Bootstrapping parameters
    mask_prob: float,
    noise_scale: float,
    n_ensemble: int,
    learning_rate: float = 1e-3,
    replay_capacity: int = 10000,
    epsilon_fn: Callable[[int], float] = lambda t: 0,
):
    r"""
    Parameters
    ----------
    seed : int
        The random seed.
    mdp_specs : MDPSpec
        The full specification of the MDP.
    optimization_horizon : int
        The total number of interactions that the agent is expected to have with the MDP. It is not used
        by this constructor.
    network_width : int
        The width of the neural networks of the agent.
    network_depth : int
        The depth of the neural networks of the agent.
    batch_size : int
        The batch size for training the agent. It is also used as the minimum replay size.
    sgd_period : int
        The stochastic gradient descent update period.
    target_update_period : int
        The interval length between updating the target network.
    mask_prob : float
        The masking probability for the bootstrapping procedure.
    noise_scale : float
        The scale of the Gaussian noise added to the value estimates.
    n_ensemble : int
        The number of networks in the ensemble.
    learning_rate : float
        The learning rate of the optimizer. By default, it is set to 1e-3.
    replay_capacity : int
        The maximum capacity of the replay buffer. By default, it is set to 10 000.
    epsilon_fn : Callable[[int], float]
        The :math:`\epsilon`-greedy probability as a function of the time. By default, it is set to zero.
    """

    # Seed the global TensorFlow and NumPy random generators.
    tf.random.set_seed(seed)
    np.random.seed(seed)

    ensemble = make_ensemble(
        mdp_specs.actions.num_values, n_ensemble, network_depth, network_width
    )
    optimizer = snt.optimizers.Adam(learning_rate=learning_rate)

    agent = BootstrappedDqn(
        obs_spec=mdp_specs.observations,
        action_spec=mdp_specs.actions,
        ensemble=ensemble,
        batch_size=batch_size,
        discount=1,  # rewards are not discounted
        replay_capacity=replay_capacity,
        min_replay_size=batch_size,
        sgd_period=sgd_period,
        target_update_period=target_update_period,
        optimizer=optimizer,
        mask_prob=mask_prob,
        noise_scale=noise_scale,
        seed=seed,
        epsilon_fn=epsilon_fn,
    )
    super().__init__(seed, agent, mdp_specs)
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- network_width (int): The width of the neural networks of the agent.
- network_depth (int): The depth of the neural networks of the agent.
- batch_size (int): The batch size for training the agent.
- sgd_period (int): The stochastic gradient descent update period.
- target_update_period (int): The interval length between updating the target network.
- mask_prob (float): The masking probability for the bootstrapping procedure.
- noise_scale (float): The scale of the Gaussian noise added to the value estimates.
- n_ensemble (int): The number of ensembles.
- learning_rate (float): The learning rate of the optimizer. By default, it is set to 1e-3.
- replay_capacity (int): The maximum capacity of the replay buffer. By default, it is set to 10 000.
- epsilon_fn (Callable[[int], float]): The \( \epsilon \)-greedy probability as a function of the time. By default, it is set to zero.
@staticmethod
def
produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
25 @staticmethod 26 def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0): 27 string = "" 28 for k, v in parameters.items(): 29 string += f"prms_{index}/BootDQNEpisodic.{k} = {v}\n" 30 return string[:-1]
Produces a string containing the gin configuration corresponding to the parameters given in input.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration file.
@staticmethod
def
is_episodic() -> bool:
Returns
- bool: True if the agent is suited for the episodic setting.
@staticmethod
def
get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
    """
    Returns
    -------
    Dict[str, tune.sample.Domain]
        The dictionary mapping each hyperparameter name to the `ray.tune` categorical sampler over its
        candidate values.
    """
    return {
        "network_width": tune.choice([64, 128, 256]),
        "network_depth": tune.choice([2, 4]),
        "batch_size": tune.choice([32, 64, 128]),
        "sgd_period": tune.choice([1, 4, 8]),
        "target_update_period": tune.choice([4, 16, 32]),
        "mask_prob": tune.choice([0.8, 0.9, 1.0]),
        "noise_scale": tune.choice([0.0, 0.05, 0.1]),
        "n_ensemble": tune.choice([8, 16, 20]),
    }
Returns
- Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to their corresponding `ray.tune` samplers.
@staticmethod
def
get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:
@staticmethod
def get_agent_instance_from_parameters(
    seed: int,
    optimization_horizon: int,
    mdp_specs: "MDPSpec",
    parameters: Dict[str, Any],
) -> "BaseAgent":
    """
    Returns an agent instance for the MDP specification and agent parameters given in input.

    Parameters
    ----------
    seed : int
        The random seed.
    optimization_horizon : int
        The total number of interactions that the agent is expected to have with the MDP.
    mdp_specs : MDPSpec
        The full specification of the MDP.
    parameters : Dict[str, Any]
        The dictionary containing the parameters of the agent. It must contain the keys `network_width`,
        `network_depth`, `batch_size`, `sgd_period`, `target_update_period`, `mask_prob`, `noise_scale`,
        and `n_ensemble`.

    Returns
    -------
    BaseAgent
        The agent instance.

    Raises
    ------
    KeyError
        If one of the required keys is missing from `parameters`.
    """
    # Keyword arguments keep this call correct even if the constructor's
    # parameter order changes (note that the order of `mdp_specs` and
    # `optimization_horizon` differs between the two signatures).
    return BootDQNEpisodic(
        seed=seed,
        mdp_specs=mdp_specs,
        optimization_horizon=optimization_horizon,
        network_width=parameters["network_width"],
        network_depth=parameters["network_depth"],
        batch_size=parameters["batch_size"],
        sgd_period=parameters["sgd_period"],
        target_update_period=parameters["target_update_period"],
        mask_prob=parameters["mask_prob"],
        noise_scale=parameters["noise_scale"],
        n_ensemble=parameters["n_ensemble"],
    )
Returns an agent instance for the MDP specification and agent parameters given in input.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.