colosseum.agent.agents.episodic.boot_dqn

from typing import Any, Dict, Callable, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.boot_dqn import BootstrappedDqn, make_ensemble
from ray import tune

from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class BootDQNEpisodic(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `BootDQN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/BootDQNEpisodic.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "batch_size": tune.choice([32, 64, 128]),
            "sgd_period": tune.choice([1, 4, 8]),
            "target_update_period": tune.choice([4, 16, 32]),
            "mask_prob": tune.choice([0.8, 0.9, 1.0]),
            "noise_scale": tune.choice([0.0, 0.05, 0.1]),
            "n_ensemble": tune.choice([8, 16, 20]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return BootDQNEpisodic(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["batch_size"],
            parameters["sgd_period"],
            parameters["target_update_period"],
            parameters["mask_prob"],
            parameters["noise_scale"],
            parameters["n_ensemble"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        H, S, d = self.emission_map.all_observations.shape
        qvals = (
            tf.stop_gradient(
                self._agent._forward[self._agent._active_head](
                    self.emission_map.all_observations.reshape(-1, d)
                )
            )
            .numpy()
            .reshape(H, S, self._mdp_spec.actions.num_values)
        )
        return get_policy_from_q_values(qvals, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        batch_size: int,
        sgd_period: int,
        target_update_period: int,
        # Actor parameters
        mask_prob: float,
        noise_scale: float,
        n_ensemble: int,
        learning_rate: float = 1e-3,
        replay_capacity: int = 10000,
        epsilon_fn: Callable[[int], float] = lambda t: 0,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        batch_size : int
            The batch size for training the agent.
        sgd_period : int
            The stochastic gradient descent update period.
        target_update_period : int
            The interval length between updating the target network.
        mask_prob : float
            The masking probability for the bootstrapping procedure.
        noise_scale : float
            The scale of the Gaussian noise added to the value estimates.
        n_ensemble : int
            The number of Q-networks in the ensemble.
        learning_rate : float
            The learning rate of the optimizer. By default, it is set to 1e-3.
        replay_capacity : int
            The maximum capacity of the replay buffer. By default, it is set to 10 000.
        epsilon_fn : Callable[[int], float]
            The :math:`\epsilon`-greedy probability as a function of time. By default, it is set to zero.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        ensemble = make_ensemble(
            mdp_specs.actions.num_values, n_ensemble, network_depth, network_width
        )
        optimizer = snt.optimizers.Adam(learning_rate=learning_rate)

        agent = BootstrappedDqn(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            ensemble=ensemble,
            batch_size=batch_size,
            discount=1,
            replay_capacity=replay_capacity,
            min_replay_size=batch_size,
            sgd_period=sgd_period,
            target_update_period=target_update_period,
            optimizer=optimizer,
            mask_prob=mask_prob,
            noise_scale=noise_scale,
            seed=seed,
            epsilon_fn=epsilon_fn,
        )
        super(BootDQNEpisodic, self).__init__(seed, agent, mdp_specs)
@gin.configurable
class BootDQNEpisodic(colosseum.utils.non_tabular.bsuite.NonTabularBsuiteAgentWrapper):

The wrapper for the BootDQN agent from bsuite.

BootDQNEpisodic(seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, batch_size: int, sgd_period: int, target_update_period: int, mask_prob: float, noise_scale: float, n_ensemble: int, learning_rate: float = 0.001, replay_capacity: int = 10000, epsilon_fn: Callable[[int], float] = lambda t: 0)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • network_width (int): The width of the neural networks of the agent.
  • network_depth (int): The depth of the neural networks of the agent.
  • batch_size (int): The batch size for training the agent.
  • sgd_period (int): The stochastic gradient descent update period.
  • target_update_period (int): The interval length between updating the target network.
  • mask_prob (float): The masking probability for the bootstrapping procedure.
  • noise_scale (float): The scale of the Gaussian noise added to the value estimates.
  • n_ensemble (int): The number of Q-networks in the ensemble.
  • learning_rate (float): The learning rate of the optimizer. By default, it is set to 1e-3.
  • replay_capacity (int): The maximum capacity of the replay buffer. By default, it is set to 10 000.
  • epsilon_fn (Callable[[int], float]): The \( \epsilon \)-greedy probability as a function of time. By default, it is set to zero; a decaying schedule can be supplied instead (see the sketch below).
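
A minimal construction sketch follows. The hyperparameter values are taken from the search space above; mdp_specs is assumed to be available from a colosseum MDP, and the decaying_epsilon schedule is purely illustrative, not a library default.

def decaying_epsilon(t: int) -> float:
    # Illustrative schedule: linear decay from 0.5 to 0.01 over 5000 time steps.
    return max(0.01, 0.5 * (1.0 - t / 5000))

agent = BootDQNEpisodic(
    seed=0,
    mdp_specs=mdp_specs,  # assumed to be available for the MDP at hand
    optimization_horizon=10_000,
    network_width=64,
    network_depth=2,
    batch_size=32,
    sgd_period=1,
    target_update_period=4,
    mask_prob=1.0,
    noise_scale=0.0,
    n_ensemble=8,
    epsilon_fn=decaying_epsilon,
)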
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces a string containing the gin configuration corresponding to the parameters given in input.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration file.
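
For instance, given the implementation above, a call with two hypothetical parameter values produces one gin binding per line:

gin_config = BootDQNEpisodic.produce_gin_file_from_parameters(
    {"network_width": 64, "network_depth": 2}, index=1
)
# gin_config is:
# prms_1/BootDQNEpisodic.network_width = 64
# prms_1/BootDQNEpisodic.network_depth = 2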
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding ray.tune sampling domains.
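
Since each value is a ray.tune sampling domain, a configuration can be drawn directly from the search space. A small sketch, assuming a ray version in which Domain objects expose a sample() method:

space = BootDQNEpisodic.get_hyperparameters_search_spaces()
parameters = {name: domain.sample() for name, domain in space.items()}
# e.g. {'network_width': 128, 'network_depth': 2, 'batch_size': 64, ...}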
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the MDP specification and agent parameters given in input.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
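
A minimal usage sketch, assuming parameters was sampled as above and mdp_specs was obtained from a colosseum MDP:

agent = BootDQNEpisodic.get_agent_instance_from_parameters(
    seed=42,
    optimization_horizon=10_000,
    mdp_specs=mdp_specs,    # assumed available
    parameters=parameters,  # e.g. sampled from get_hyperparameters_search_spaces()
)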
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The estimate of the optimal policy given the agent's current knowledge, in the form of a distribution over actions.
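
Given the reshape in the property's implementation, the returned array can be expected to have shape (H, S, A), one distribution over the A actions for each in-episode time step and state; this is an inference from the source above, not a documented guarantee.

policy = agent.current_optimal_stochastic_policy
# Expected: policy.shape == (H, S, A), with policy[h, s] a probability
# vector over actions (summing to one along the last axis).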