colosseum.agent.agents.episodic.actor_critic

from typing import Dict, Any, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.actor_critic import PolicyValueNet, ActorCritic
from ray import tune

from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class ActorCriticEpisodic(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `ActorCritic` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/ActorCriticEpisodic.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "max_sequence_length": tune.choice([16, 32, 64, 128]),
            "td_lambda": tune.choice([0.7, 0.8, 0.9]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":

        return ActorCriticEpisodic(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["max_sequence_length"],
            parameters["td_lambda"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        H, S, d = self.emission_map.all_observations.shape
        logits = (
            tf.stop_gradient(
                self._agent._network(
                    tf.convert_to_tensor(
                        self.emission_map.all_observations.reshape(-1, d)
                    )
                )[0].logits
            )
            .numpy()
            .reshape(H, S, self._mdp_spec.actions.num_values)
        )
        return (logits >= logits.max(-1, keepdims=True)).astype(np.float32)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        max_sequence_length: int,
        td_lambda: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        max_sequence_length : int
            The maximum sequence length for training the agent.
        td_lambda : float
            The TD(:math:`\lambda`) parameter for training the agent.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        network = PolicyValueNet(
            hidden_sizes=[network_width] * network_depth,
            action_spec=mdp_specs.actions,
        )
        agent = ActorCritic(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            network=network,
            optimizer=snt.optimizers.Adam(learning_rate=3e-3),
            max_sequence_length=max_sequence_length,
            td_lambda=td_lambda,
            discount=0.99,
            seed=seed,
        )
        super(ActorCriticEpisodic, self).__init__(seed, agent, mdp_specs)
@gin.configurable
class ActorCriticEpisodic(colosseum.utils.non_tabular.bsuite.NonTabularBsuiteAgentWrapper):

The wrapper for the ActorCritic agent from bsuite.

ActorCriticEpisodic(seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, max_sequence_length: int, td_lambda: float)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • network_width (int): The width of the neural networks of the agent.
  • network_depth (int): The depth of the neural networks of the agent.
  • max_sequence_length (int): The maximum sequence length for training the agent.
  • td_lambda (float): The TD(λ) parameter for training the agent.
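
As a usage sketch (not part of the package documentation), assuming an MDPSpec instance mdp_specs for an episodic Colosseum MDP is already available, the constructor can be called directly with the hyperparameters listed above:

# A minimal construction sketch; `mdp_specs` is assumed to be an already
# built MDPSpec describing the episodic MDP the agent will interact with.
agent = ActorCriticEpisodic(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=50_000,  # total number of agent/MDP interactions
    network_width=128,            # width of each hidden layer
    network_depth=2,              # number of hidden layers
    max_sequence_length=32,       # maximum sequence length used for training
    td_lambda=0.9,                # TD(lambda) parameter
)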
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces the gin configuration string corresponding to the given parameters.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration as a string.
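
For instance, following the implementation shown in the module source, each parameter is bound inside the prms_{index}/ActorCriticEpisodic gin scope; the snippet below shows the exact string produced for two parameters:

config_str = ActorCriticEpisodic.produce_gin_file_from_parameters(
    {"network_width": 64, "td_lambda": 0.9}, index=1
)
print(config_str)
# prms_1/ActorCriticEpisodic.network_width = 64
# prms_1/ActorCriticEpisodic.td_lambda = 0.9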
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: A dictionary mapping hyperparameter names to their ray.tune samplers.
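
As a sketch of how the search spaces can be used outside a full ray.tune run (assuming the .sample() method exposed by ray.tune domains), a random configuration can be drawn as follows:

search_spaces = ActorCriticEpisodic.get_hyperparameters_search_spaces()
# Draw one random value from each hyperparameter domain.
parameters = {name: domain.sample() for name, domain in search_spaces.items()}
# e.g. {'network_width': 128, 'network_depth': 2,
#       'max_sequence_length': 64, 'td_lambda': 0.8}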
@staticmethod
def get_agent_instance_from_parameters(seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the given MDP specification and agent parameters.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
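
A sketch combining this method with a parameter dictionary such as one drawn from the hyperparameter search spaces above; mdp_specs is assumed to be an MDPSpec for the target episodic MDP:

agent = ActorCriticEpisodic.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_specs,  # assumed to be available
    parameters={
        "network_width": 64,
        "network_depth": 2,
        "max_sequence_length": 32,
        "td_lambda": 0.9,
    },
)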
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The agent's current estimate of the optimal policy, expressed as a distribution over actions.
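
As a reading aid (a sketch assuming an instantiated agent `agent`), the returned array has shape (H, S, A), where H is the episode horizon, S the number of states, and A the number of actions; each (h, s) slice places its mass on the greedy action(s) under the current policy network:

policy = agent.current_optimal_stochastic_policy  # shape (H, S, A)
greedy_actions = policy.argmax(-1)                # shape (H, S)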