colosseum.agent.agents.episodic.actor_critic_rnn

from typing import Dict, Any, TYPE_CHECKING

import dm_env
import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.actor_critic_rnn import PolicyValueRNN, ActorCriticRNN
from ray import tune

from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec
    from colosseum.mdp import ACTION_TYPE


@gin.configurable
class ActorCriticRNNEpisodic(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `ActorCriticRNN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/ActorCriticRNNEpisodic.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "max_sequence_length": tune.choice([16, 32, 64, 128]),
            "td_lambda": tune.choice([0.7, 0.8, 0.9]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":

        return ActorCriticRNNEpisodic(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["max_sequence_length"],
            parameters["td_lambda"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        H, S, d = self.emission_map.all_observations.shape

        logits = (
            self._agent._network(
                tf.convert_to_tensor(self.emission_map.all_observations.reshape(-1, d)),
                self._agent._network.initial_state(self._mdp_spec.n_states * H),
            )[0][0]
            .numpy()
            .reshape(H, S, self._mdp_spec.actions.num_values)
        )

        return (logits >= logits.max(-1, keepdims=True)).astype(np.float32)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        max_sequence_length: int,
        td_lambda: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        max_sequence_length : int
            The maximum sequence length for training the agent.
        td_lambda : float
            The TD(:math:`\lambda`) parameter for training the agent.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        network = PolicyValueRNN(
            hidden_sizes=[network_width] * network_depth,
            n_actions=mdp_specs.actions.num_values,
        )
        agent = ActorCriticRNN(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            network=network,
            optimizer=snt.optimizers.Adam(learning_rate=3e-3),
            max_sequence_length=max_sequence_length,
            td_lambda=td_lambda,
            discount=0.99,
            seed=seed,
        )
        super(ActorCriticRNNEpisodic, self).__init__(seed, agent, mdp_specs)

    def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE":
        action = super(ActorCriticRNNEpisodic, self).select_action(ts, time)
        if action >= self._mdp_spec.actions.num_values:
            return self._rng.randint(self._mdp_spec.actions.num_values)
        return action

@gin.configurable
class ActorCriticRNNEpisodic(colosseum.utils.non_tabular.bsuite.NonTabularBsuiteAgentWrapper):

The wrapper for the ActorCriticRNN agent from bsuite.

ActorCriticRNNEpisodic( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, max_sequence_length: int, td_lambda: float)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • network_width (int): The width of the neural networks of the agent.
  • network_depth (int): The depth of the neural networks of the agent.
  • max_sequence_length (int): The maximum sequence length for training the agent.
  • td_lambda (float): The TD(λ) parameter for training the agent.
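
A minimal construction sketch, assuming an MDPSpec for the target episodic MDP is already available; the get_mdp_spec helper below is hypothetical and stands in for whatever routine produces that specification.

# Hedged sketch: `get_mdp_spec` and `my_episodic_mdp` are hypothetical placeholders,
# not part of this module.
mdp_specs = get_mdp_spec(my_episodic_mdp)

agent = ActorCriticRNNEpisodic(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=50_000,
    network_width=128,
    network_depth=2,
    max_sequence_length=32,
    td_lambda=0.9,
)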
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces a string containing the gin config file corresponding to the parameters given as input.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration file.
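
For example, given the parameter dictionary below, the method emits one gin binding per entry, scoped by the prms_{index}/ prefix:

params = {
    "network_width": 128,
    "network_depth": 2,
    "max_sequence_length": 32,
    "td_lambda": 0.9,
}
print(ActorCriticRNNEpisodic.produce_gin_file_from_parameters(params, index=3))
# prms_3/ActorCriticRNNEpisodic.network_width = 128
# prms_3/ActorCriticRNNEpisodic.network_depth = 2
# prms_3/ActorCriticRNNEpisodic.max_sequence_length = 32
# prms_3/ActorCriticRNNEpisodic.td_lambda = 0.9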
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding ray.tune samplers.
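
A short sketch of drawing one configuration from the search space, assuming the returned ray.tune Domain objects expose a .sample() method (as in the ray versions that provide tune.sample.Domain):

search_space = ActorCriticRNNEpisodic.get_hyperparameters_search_spaces()
# Sample one value per hyperparameter, e.g.
# {"network_width": 256, "network_depth": 2, "max_sequence_length": 64, "td_lambda": 0.8}.
parameters = {name: domain.sample() for name, domain in search_space.items()}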
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the MDP specification and agent parameters given as input.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
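
A usage sketch, assuming mdp_specs is the MDPSpec of the target MDP and parameters is a dictionary with the keys listed in get_hyperparameters_search_spaces (for instance, one sampled as shown above):

agent = ActorCriticRNNEpisodic.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_specs,      # assumed available for the target MDP
    parameters=parameters,    # e.g. the dictionary sampled above
)
assert agent.is_episodic()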
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The agent's current estimate of the optimal policy, expressed as a distribution over actions.
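
The returned array has shape (H, S, n_actions): for each in-episode time step and state, the entries are 1 for the actions whose logits attain the maximum (ties can yield more than one 1, so rows are not necessarily normalized). A small numpy sketch of that conversion, on hypothetical logits:

import numpy as np

# Hypothetical logits for H=2 time steps, S=1 state and 3 actions.
logits = np.array([[[0.1, 0.9, 0.9]],
                   [[1.2, 0.3, 0.4]]])
policy = (logits >= logits.max(-1, keepdims=True)).astype(np.float32)
# policy[0, 0] == [0., 1., 1.]   (tie between the last two actions)
# policy[1, 0] == [1., 0., 0.]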
def select_action( self, ts: dm_env._environment.TimeStep, time: int) -> Union[int, float, numpy.ndarray]:
Parameters
  • ts (dm_env.TimeStep): The TimeStep for which the agent is required to calculate the next action.
  • time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
  • action (ACTION_TYPE): The action that the agent suggests to take given the observation and the time step.
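
A sketch of how select_action fits into an interaction loop, assuming env is a dm_env.Environment built on the same MDP the agent was created for (how env is obtained is not shown, and episode_length is a placeholder for the episode horizon):

ts = env.reset()
for t in range(episode_length):  # in-episode time, as expected by select_action
    action = agent.select_action(ts, t)
    ts = env.step(action)
    if ts.last():
        break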