colosseum.agent.agents.infinite_horizon.dqn

from typing import Any, Dict, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.dqn import DQN
from ray import tune

from colosseum.dynamic_programming.utils import argmax_2d
from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class DQNContinuous(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `DQN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/DQNContinuous.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return False

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "batch_size": tune.choice([32, 64, 128]),
            "sgd_period": tune.choice([1, 4, 8]),
            "target_update_period": tune.choice([4, 16, 32]),
            "epsilon": tune.choice([0.01, 0.05, 0.1]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":

        return DQNContinuous(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["batch_size"],
            parameters["sgd_period"],
            parameters["target_update_period"],
            parameters["epsilon"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        qvals = tf.stop_gradient(
            self._agent._forward(self.emission_map.all_observations)
        ).numpy()
        return argmax_2d(qvals)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        batch_size: int,
        sgd_period: int,
        target_update_period: int,
        # Actor parameters
        epsilon: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        batch_size : int
            The batch size for training the agent.
        sgd_period : int
            The stochastic gradient descent update period.
        target_update_period : int
            The interval length between updating the target network.
        epsilon : float
            The :math:`\epsilon` greedy exploration probability.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        network = snt.Sequential(
            [
                snt.Flatten(),
                snt.nets.MLP(
                    [network_width] * network_depth + [mdp_specs.actions.num_values]
                ),
            ]
        )
        optimizer = snt.optimizers.Adam(learning_rate=1e-3)

        agent = DQN(
            action_spec=mdp_specs.actions,
            network=network,
            batch_size=batch_size,
            discount=0.99,
            replay_capacity=10000,
            min_replay_size=100,
            sgd_period=sgd_period,
            target_update_period=target_update_period,
            optimizer=optimizer,
            epsilon=epsilon,
            seed=seed,
        )

        super(DQNContinuous, self).__init__(seed, agent, mdp_specs)
@gin.configurable
class DQNContinuous(colosseum.utils.non_tabular.bsuite.NonTabularBsuiteAgentWrapper):

The wrapper for the DQN agent from bsuite.

DQNContinuous( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, batch_size: int, sgd_period: int, target_update_period: int, epsilon: float)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • network_width (int): The width of the neural networks of the agent.
  • network_depth (int): The depth of the neural networks of the agent.
  • batch_size (int): The batch size for training the agent.
  • sgd_period (int): The stochastic gradient descent update period.
  • target_update_period (int): The interval length between updating the target network.
  • epsilon (float): The \( \epsilon \) greedy exploration probability.
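
A minimal usage sketch (the values below are illustrative, and `mdp_specs` is assumed to be an MDPSpec obtained from a Colosseum MDP, which is not constructed here):

    agent = DQNContinuous(
        seed=42,
        mdp_specs=mdp_specs,  # assumption: an MDPSpec for the target MDP
        optimization_horizon=100_000,
        network_width=64,
        network_depth=2,
        batch_size=32,
        sgd_period=1,
        target_update_period=4,
        epsilon=0.05,
    )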
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces a string containing the gin config file corresponding to the parameters given in input.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration file.
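
For illustration, given parameters = {"network_width": 64, "epsilon": 0.05} and index=1, the produced string would be:

    prms_1/DQNContinuous.network_width = 64
    prms_1/DQNContinuous.epsilon = 0.05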
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: The dictionary with hyperparameter names as keys and the corresponding ray.tune samplers as values.
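
A brief sketch of drawing one configuration from these search spaces (this relies on the sample() method of ray.tune domains; module paths and behavior may differ across ray versions):

    search_spaces = DQNContinuous.get_hyperparameters_search_spaces()
    parameters = {name: domain.sample() for name, domain in search_spaces.items()}
    # e.g. {'network_width': 128, 'network_depth': 2, 'batch_size': 64,
    #       'sgd_period': 4, 'target_update_period': 16, 'epsilon': 0.05}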
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the MDP specification and agent parameters given in input.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
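
Such a dictionary, for example one sampled as in the sketch under get_hyperparameters_search_spaces, can then be turned into an agent instance (sketch; `mdp_specs` is assumed to be available):

    agent = DQNContinuous.get_agent_instance_from_parameters(
        seed=0,
        optimization_horizon=50_000,
        mdp_specs=mdp_specs,
        parameters=parameters,
    )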
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The estimate of the optimal policy given the current knowledge of the agent, in the form of a distribution over actions.
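
Conceptually, the property evaluates the Q-network on all observations and converts the resulting Q-values into a greedy distribution over actions. A rough numpy sketch of that last step (the actual argmax_2d helper in colosseum may handle ties differently):

    import numpy as np

    def greedy_policy_from_q(qvals: np.ndarray) -> np.ndarray:
        # qvals has shape (num_states, num_actions); the returned array puts all
        # probability mass on an arg-max action in each state.
        policy = np.zeros_like(qvals)
        policy[np.arange(qvals.shape[0]), qvals.argmax(axis=1)] = 1.0
        return policy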