colosseum.agent.agents.episodic.dqn

  1from typing import Any, Dict, TYPE_CHECKING
  2
  3import gin
  4import numpy as np
  5import sonnet as snt
  6import tensorflow as tf
  7from bsuite.baselines.tf.dqn import DQN
  8from ray import tune
  9
 10from colosseum.dynamic_programming.utils import get_policy_from_q_values
 11from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper
 12
 13if TYPE_CHECKING:
 14    from colosseum.agent.agents.base import BaseAgent
 15    from colosseum.utils.acme.specs import MDPSpec
 16
 17
 18@gin.configurable
 19class DQNEpisodic(NonTabularBsuiteAgentWrapper):
 20    """
 21    The wrapper for the `DQN` agent from `bsuite`.
 22    """
 23
 24    @staticmethod
 25    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
 26        string = ""
 27        for k, v in parameters.items():
 28            string += f"prms_{index}/DQNEpisodic.{k} = {v}\n"
 29        return string[:-1]
 30
 31    @staticmethod
 32    def is_episodic() -> bool:
 33        return True
 34
 35    @staticmethod
 36    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
 37        return {
 38            "network_width": tune.choice([64, 128, 256]),
 39            "network_depth": tune.choice([2, 4]),
 40            "batch_size": tune.choice([32, 64, 128]),
 41            "sgd_period": tune.choice([1, 4, 8]),
 42            "target_update_period": tune.choice([4, 16, 32]),
 43            "epsilon": tune.choice([0.01, 0.05, 0.1]),
 44        }
 45
 46    @staticmethod
 47    def get_agent_instance_from_parameters(
 48        seed: int,
 49        optimization_horizon: int,
 50        mdp_specs: "MDPSpec",
 51        parameters: Dict[str, Any],
 52    ) -> "BaseAgent":
 53
 54        return DQNEpisodic(
 55            seed,
 56            mdp_specs,
 57            optimization_horizon,
 58            parameters["network_width"],
 59            parameters["network_depth"],
 60            parameters["batch_size"],
 61            parameters["sgd_period"],
 62            parameters["target_update_period"],
 63            parameters["epsilon"],
 64        )
 65
 66    @property
 67    def current_optimal_stochastic_policy(self) -> np.ndarray:
 68        H, S, d = self.emission_map.all_observations.shape
 69        qvals = (
 70            tf.stop_gradient(
 71                self._agent._forward(self.emission_map.all_observations.reshape(-1, d))
 72            )
 73            .numpy()
 74            .reshape(H, S, self._mdp_spec.actions.num_values)
 75        )
 76        return get_policy_from_q_values(qvals, True)
 77
 78    def __init__(
 79        self,
 80        seed: int,
 81        mdp_specs: "MDPSpec",
 82        optimization_horizon: int,
 83        # MDP model parameters
 84        network_width: int,
 85        network_depth: int,
 86        batch_size: int,
 87        sgd_period: int,
 88        target_update_period: int,
 89        # Actor parameters
 90        epsilon: float,
 91    ):
 92        r"""
 93        Parameters
 94        ----------
 95        seed : int
 96            The random seed.
 97        mdp_specs : MDPSpec
 98            The full specification of the MDP.
 99        optimization_horizon : int
100            The total number of interactions that the agent is expected to have with the MDP.
101        network_width : int
102            The width of the neural networks of the agent.
103        network_depth : int
104            The depth of the neural networks of the agent.
105        batch_size : int
106            The batch size for training the agent.
107        sgd_period : int
108            The stochastic gradient descent update period.
109        target_update_period : int
110            The interval length between updating the target network.
111        epsilon : float
112            The :math:`\epsilon`-greedy exploration probability.
113        """
114
115        tf.random.set_seed(seed)
116        np.random.seed(seed)
117
118        network = snt.Sequential(
119            [
120                snt.Flatten(),
121                snt.nets.MLP(
122                    [network_width] * network_depth + [mdp_specs.actions.num_values]
123                ),
124            ]
125        )
126        optimizer = snt.optimizers.Adam(learning_rate=1e-3)
127
128        agent = DQN(
129            action_spec=mdp_specs.actions,
130            network=network,
131            batch_size=batch_size,
132            discount=1,
133            replay_capacity=10000,
134            min_replay_size=100,
135            sgd_period=sgd_period,
136            target_update_period=target_update_period,
137            optimizer=optimizer,
138            epsilon=epsilon,
139            seed=seed,
140        )
141
142        super(DQNEpisodic, self).__init__(seed, agent, mdp_specs)
@gin.configurable
class DQNEpisodic(colosseum.utils.non_tabular.bsuite.NonTabularBsuiteAgentWrapper):

The wrapper for the DQN agent from bsuite.

DQNEpisodic( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, batch_size: int, sgd_period: int, target_update_period: int, epsilon: float)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • network_width (int): The width of the neural networks of the agent.
  • network_depth (int): The depth of the neural networks of the agent.
  • batch_size (int): The batch size for training the agent.
  • sgd_period (int): The stochastic gradient descent update period.
  • target_update_period (int): The interval length between updating the target network.
  • epsilon (float): The \( \epsilon \)-greedy exploration probability.
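A minimal construction sketch follows, assuming an MDPSpec has already been obtained from the Colosseum MDP under study (the variable mdp_specs below); the helper name make_dqn_agent and the concrete values are illustrative only and mirror the hyperparameter search space documented further down.

from colosseum.agent.agents.episodic.dqn import DQNEpisodic

def make_dqn_agent(mdp_specs, seed: int = 42) -> DQNEpisodic:
    # Hypothetical helper: builds the agent with example values taken from
    # the hyperparameter search space of DQNEpisodic.
    return DQNEpisodic(
        seed,
        mdp_specs,
        100_000,  # optimization_horizon: total interactions with the MDP
        64,       # network_width
        2,        # network_depth
        32,       # batch_size
        1,        # sgd_period
        4,        # target_update_period
        0.05,     # epsilon
    )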
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces a string containing the gin configuration corresponding to the parameters given as input.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration file.
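For illustration, with arbitrary example values the produced string contains one gin binding per parameter, prefixed with the given index:

from colosseum.agent.agents.episodic.dqn import DQNEpisodic

parameters = {"network_width": 64, "network_depth": 2, "epsilon": 0.05}
print(DQNEpisodic.produce_gin_file_from_parameters(parameters, index=3))
# prms_3/DQNEpisodic.network_width = 64
# prms_3/DQNEpisodic.network_depth = 2
# prms_3/DQNEpisodic.epsilon = 0.05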
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding ray.tune samplers.
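A hedged sketch of drawing a few configurations outside of a full ray.tune run; it assumes the installed ray version exposes Domain.sample() (otherwise values can be picked by hand from the choices above):

from colosseum.agent.agents.episodic.dqn import DQNEpisodic

search_space = DQNEpisodic.get_hyperparameters_search_spaces()
for index in range(3):
    # Domain.sample() is assumed to be available in the installed ray version.
    parameters = {name: domain.sample() for name, domain in search_space.items()}
    print(DQNEpisodic.produce_gin_file_from_parameters(parameters, index))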
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the MDP specification and agent parameters given as input.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
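As a usage sketch, a parameter dictionary with the keys of the search space above can be turned into an agent; mdp_specs again stands for the specification of the MDP under study, and the wrapper function below is hypothetical.

from typing import Any, Dict

from colosseum.agent.agents.episodic.dqn import DQNEpisodic

def agent_from_parameters(mdp_specs, parameters: Dict[str, Any], seed: int = 0):
    # Thin convenience wrapper (hypothetical); `parameters` must contain
    # exactly the keys listed in get_hyperparameters_search_spaces().
    return DQNEpisodic.get_agent_instance_from_parameters(
        seed, 50_000, mdp_specs, parameters
    )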
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The estimate of the optimal policy given the agent's current knowledge, in the form of a distribution over actions for each in-episode time step and state.
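A short inspection sketch, assuming agent is a DQNEpisodic instance whose emission map exposes all observations (as in the source above, the returned array is reshaped to one action distribution per in-episode time step and state):

import numpy as np

from colosseum.agent.agents.episodic.dqn import DQNEpisodic

def describe_policy(agent: DQNEpisodic) -> None:
    # The policy array has shape (H, S, A): in-episode time steps, states, actions.
    policy = agent.current_optimal_stochastic_policy
    H, S, A = policy.shape
    assert np.allclose(policy.sum(-1), 1.0)  # each (h, s) slice is a distribution
    print(f"policy over {H} time steps, {S} states, {A} actions")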