colosseum.agent.agents.episodic.dqn
Module source:

```python
from typing import Any, Dict, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.dqn import DQN
from ray import tune

from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class DQNEpisodic(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `DQN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/DQNEpisodic.{k} = {v}\n"
        # Strip the trailing newline.
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "batch_size": tune.choice([32, 64, 128]),
            "sgd_period": tune.choice([1, 4, 8]),
            "target_update_period": tune.choice([4, 16, 32]),
            "epsilon": tune.choice([0.01, 0.05, 0.1]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return DQNEpisodic(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["batch_size"],
            parameters["sgd_period"],
            parameters["target_update_period"],
            parameters["epsilon"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        # Q-values for every (in-episode time step, state) pair, computed
        # without tracking gradients.
        H, S, d = self.emission_map.all_observations.shape
        qvals = (
            tf.stop_gradient(
                self._agent._forward(self.emission_map.all_observations.reshape(-1, d))
            )
            .numpy()
            .reshape(H, S, self._mdp_spec.actions.num_values)
        )
        return get_policy_from_q_values(qvals, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        batch_size: int,
        sgd_period: int,
        target_update_period: int,
        # Actor parameters
        epsilon: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have
            with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        batch_size : int
            The batch size for training the agent.
        sgd_period : int
            The stochastic gradient descent update period.
        target_update_period : int
            The interval length between updating the target network.
        epsilon : float
            The probability of selecting an action uniformly at random
            (:math:`\epsilon`-greedy exploration).
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        # Q-network: flatten the observation, then an MLP with one output per action.
        network = snt.Sequential(
            [
                snt.Flatten(),
                snt.nets.MLP(
                    [network_width] * network_depth + [mdp_specs.actions.num_values]
                ),
            ]
        )
        optimizer = snt.optimizers.Adam(learning_rate=1e-3)

        agent = DQN(
            action_spec=mdp_specs.actions,
            network=network,
            batch_size=batch_size,
            discount=1,
            replay_capacity=10000,
            min_replay_size=100,
            sgd_period=sgd_period,
            target_update_period=target_update_period,
            optimizer=optimizer,
            epsilon=epsilon,
            seed=seed,
        )

        super(DQNEpisodic, self).__init__(seed, agent, mdp_specs)
```
@gin.configurable
class DQNEpisodic(NonTabularBsuiteAgentWrapper)
The wrapper for the `DQN` agent from `bsuite`.
DQNEpisodic(
    seed: int,
    mdp_specs: colosseum.utils.acme.specs.MDPSpec,
    optimization_horizon: int,
    network_width: int,
    network_depth: int,
    batch_size: int,
    sgd_period: int,
    target_update_period: int,
    epsilon: float,
)
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- network_width (int): The width of the neural networks of the agent.
- network_depth (int): The depth of the neural networks of the agent.
- batch_size (int): The batch size for training the agent.
- sgd_period (int): The stochastic gradient descent update period.
- target_update_period (int): The interval length between updating the target network.
- epsilon (float): The probability of selecting an action uniformly at random (\( \epsilon \)-greedy exploration).
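A minimal instantiation sketch, assuming an `MDPSpec` instance (`mdp_specs`) is available from the surrounding Colosseum experiment setup; the hyperparameter values are illustrative picks from the search space returned by `get_hyperparameters_search_spaces`.

```python
# Sketch only: `mdp_specs` is assumed to be an MDPSpec for the target episodic MDP.
agent = DQNEpisodic(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=100_000,
    network_width=64,        # MLP hidden-layer width
    network_depth=2,         # number of hidden layers
    batch_size=32,
    sgd_period=1,            # SGD update at every interaction
    target_update_period=4,  # target-network sync interval
    epsilon=0.05,            # probability of a random action
)
```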
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0)
Produces a string containing the gin configuration corresponding to the parameters given as input.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration as a string.
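For illustration, a hypothetical call with two parameters yields one gin binding per entry, with the trailing newline stripped:

```python
gin_config = DQNEpisodic.produce_gin_file_from_parameters(
    {"network_width": 64, "epsilon": 0.05}, index=3
)
print(gin_config)
# prms_3/DQNEpisodic.network_width = 64
# prms_3/DQNEpisodic.epsilon = 0.05
```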
@staticmethod
def is_episodic() -> bool
Returns
- bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]
Returns
- Dict[str, tune.sample.Domain]: A dictionary mapping hyperparameter names to the corresponding `ray.tune` samplers.
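As a sketch of how the returned domains can be used outside a full Ray Tune run, each domain exposes a `sample()` method, so a random configuration can be drawn directly:

```python
space = DQNEpisodic.get_hyperparameters_search_spaces()
# Draw one value from each hyperparameter domain, e.g. for a quick random search.
params = {name: domain.sample() for name, domain in space.items()}
# Possible result: {'network_width': 128, 'network_depth': 2, 'batch_size': 32,
#                   'sgd_period': 4, 'target_update_period': 16, 'epsilon': 0.05}
```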
@staticmethod
def get_agent_instance_from_parameters(
    seed: int,
    optimization_horizon: int,
    mdp_specs: colosseum.utils.acme.specs.MDPSpec,
    parameters: Dict[str, Any],
) -> colosseum.agent.agents.base.BaseAgent
Returns an agent instance for the MDP specification and agent parameters given as input.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
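A sketch combining this factory with the search space above; as before, `mdp_specs` is assumed to come from the experiment setup:

```python
space = DQNEpisodic.get_hyperparameters_search_spaces()
params = {name: domain.sample() for name, domain in space.items()}

# Build a fully configured agent for the given MDP specification and horizon.
agent = DQNEpisodic.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_specs,
    parameters=params,
)
```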