colosseum.agent.agents.infinite_horizon.dqn
```python
from typing import Any, Dict, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.dqn import DQN
from ray import tune

from colosseum.dynamic_programming.utils import argmax_2d
from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class DQNContinuous(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `DQN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/DQNContinuous.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return False

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "batch_size": tune.choice([32, 64, 128]),
            "sgd_period": tune.choice([1, 4, 8]),
            "target_update_period": tune.choice([4, 16, 32]),
            "epsilon": tune.choice([0.01, 0.05, 0.1]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return DQNContinuous(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["batch_size"],
            parameters["sgd_period"],
            parameters["target_update_period"],
            parameters["epsilon"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        qvals = tf.stop_gradient(
            self._agent._forward(self.emission_map.all_observations)
        ).numpy()
        return argmax_2d(qvals)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        batch_size: int,
        sgd_period: int,
        target_update_period: int,
        # Actor parameters
        epsilon: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to
            have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        batch_size : int
            The batch size for training the agent.
        sgd_period : int
            The number of time steps between stochastic gradient descent updates.
        target_update_period : int
            The number of steps between updates of the target network.
        epsilon : float
            The :math:`\epsilon`-greedy exploration probability.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        network = snt.Sequential(
            [
                snt.Flatten(),
                snt.nets.MLP(
                    [network_width] * network_depth + [mdp_specs.actions.num_values]
                ),
            ]
        )
        optimizer = snt.optimizers.Adam(learning_rate=1e-3)

        agent = DQN(
            action_spec=mdp_specs.actions,
            network=network,
            batch_size=batch_size,
            discount=0.99,
            replay_capacity=10000,
            min_replay_size=100,
            sgd_period=sgd_period,
            target_update_period=target_update_period,
            optimizer=optimizer,
            epsilon=epsilon,
            seed=seed,
        )

        super(DQNContinuous, self).__init__(seed, agent, mdp_specs)
```
```python
@gin.configurable
class DQNContinuous(NonTabularBsuiteAgentWrapper)
```
The wrapper for the `DQN` agent from `bsuite`.
```python
DQNContinuous(
    seed: int,
    mdp_specs: colosseum.utils.acme.specs.MDPSpec,
    optimization_horizon: int,
    network_width: int,
    network_depth: int,
    batch_size: int,
    sgd_period: int,
    target_update_period: int,
    epsilon: float,
)
```
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- network_width (int): The width of the neural networks of the agent.
- network_depth (int): The depth of the neural networks of the agent.
- batch_size (int): The batch size for training the agent.
- sgd_period (int): The number of time steps between stochastic gradient descent updates.
- target_update_period (int): The number of steps between updates of the target network.
- epsilon (float): The \( \epsilon \)-greedy exploration probability, i.e., the probability of taking a uniformly random action at each step.
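As a usage sketch, the constructor can be called directly with concrete hyperparameters. The `mdp_specs` object is assumed to have been built elsewhere with the `colosseum` utilities, since constructing an `MDPSpec` is outside the scope of this page:

```python
from colosseum.agent.agents.infinite_horizon.dqn import DQNContinuous

# Hypothetical setup: `mdp_specs` is assumed to be an MDPSpec for a
# continuous (infinite-horizon) MDP, obtained elsewhere in colosseum.
agent = DQNContinuous(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=50_000,
    network_width=64,         # width of each hidden layer
    network_depth=2,          # number of hidden layers
    batch_size=32,
    sgd_period=4,             # one SGD update every 4 interactions
    target_update_period=16,  # target network refresh interval
    epsilon=0.05,             # constant exploration probability
)
```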
```python
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0)
```
Produces a string containing the gin configuration corresponding to the parameters given as input.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The content of the gin configuration file.
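For concreteness, the snippet below traces the method on a small parameter dictionary; the output follows directly from the loop in the source above.

```python
from colosseum.agent.agents.infinite_horizon.dqn import DQNContinuous

parameters = {"network_width": 64, "network_depth": 2, "epsilon": 0.05}
print(DQNContinuous.produce_gin_file_from_parameters(parameters, index=1))
# prms_1/DQNContinuous.network_width = 64
# prms_1/DQNContinuous.network_depth = 2
# prms_1/DQNContinuous.epsilon = 0.05
```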
```python
@staticmethod
def is_episodic() -> bool
```
Returns
- bool: True if the agent is suited for the episodic setting. Since `DQNContinuous` targets the continuous (infinite-horizon) setting, this returns False.
```python
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]
```
Returns
- Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding `ray.tune` samplers.
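One way to draw a single configuration from these spaces is to sample each domain independently; this assumes the `ray.tune` `Domain` objects expose a `sample()` method, which holds for recent `ray` releases but should be checked against the pinned dependency.

```python
from colosseum.agent.agents.infinite_horizon.dqn import DQNContinuous

search_spaces = DQNContinuous.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in search_spaces.items()}
# e.g. {'network_width': 128, 'network_depth': 2, 'batch_size': 64,
#       'sgd_period': 4, 'target_update_period': 16, 'epsilon': 0.05}
```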
```python
@staticmethod
def get_agent_instance_from_parameters(
    seed: int,
    optimization_horizon: int,
    mdp_specs: colosseum.utils.acme.specs.MDPSpec,
    parameters: Dict[str, Any],
) -> colosseum.agent.agents.base.BaseAgent
```
Returns an agent instance for the given MDP specification and agent parameters.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
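Putting the pieces together, a configuration sampled from the search spaces can be turned directly into an agent instance. As before, `mdp_specs` is assumed to be available from the surrounding code, and `Domain.sample()` is assumed to exist:

```python
from colosseum.agent.agents.infinite_horizon.dqn import DQNContinuous

search_spaces = DQNContinuous.get_hyperparameters_search_spaces()
parameters = {k: domain.sample() for k, domain in search_spaces.items()}

agent = DQNContinuous.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_specs,  # assumed: an MDPSpec built elsewhere
    parameters=parameters,
)
```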