colosseum.agent.agents.infinite_horizon.actor_critic

from typing import Dict, Any, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.actor_critic import PolicyValueNet, ActorCritic
from ray import tune

from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class ActorCriticContinuous(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `ActorCritic` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/ActorCriticContinuous.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return False

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "max_sequence_length": tune.choice([16, 32, 64, 128]),
            "td_lambda": tune.choice([0.7, 0.8, 0.9]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":

        return ActorCriticContinuous(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["max_sequence_length"],
            parameters["td_lambda"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        logits = tf.stop_gradient(
            self._agent._network(
                tf.convert_to_tensor(self.emission_map.all_observations)
            )[0].logits
        ).numpy()

        return (logits >= logits.max(-1, keepdims=True)).astype(np.float32)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        max_sequence_length: int,
        td_lambda: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        max_sequence_length : int
            The maximum sequence length for training the agent.
        td_lambda : float
            The TD(:math:`\lambda`) parameter for training the agent.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        network = PolicyValueNet(
            hidden_sizes=[network_width] * network_depth,
            action_spec=mdp_specs.actions,
        )
        agent = ActorCritic(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            network=network,
            optimizer=snt.optimizers.Adam(learning_rate=3e-3),
            max_sequence_length=max_sequence_length,
            td_lambda=td_lambda,
            discount=0.99,
            seed=seed,
        )
        super(ActorCriticContinuous, self).__init__(seed, agent, mdp_specs)
@gin.configurable
class ActorCriticContinuous(colosseum.utils.non_tabular.bsuite.NonTabularBsuiteAgentWrapper):

The wrapper for the ActorCritic agent from bsuite.

ActorCriticContinuous( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, max_sequence_length: int, td_lambda: float)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • network_width (int): The width of the neural networks of the agent.
  • network_depth (int): The depth of the neural networks of the agent.
  • max_sequence_length (int): The maximum sequence length for training the agent.
  • td_lambda (float): The TD(λ) parameter for training the agent.
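
A minimal instantiation sketch. It assumes an `MDPSpec` object (here called `mdp_spec`) has already been obtained from a Colosseum MDP; only the agent construction is shown, and the hyperparameter values are arbitrary.

# Minimal sketch: `mdp_spec` is assumed to be an existing
# colosseum.utils.acme.specs.MDPSpec instance; it is not constructed here.
agent = ActorCriticContinuous(
    seed=42,
    mdp_specs=mdp_spec,            # assumed to be available
    optimization_horizon=50_000,
    network_width=128,
    network_depth=2,
    max_sequence_length=64,
    td_lambda=0.8,
)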
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces a string containing the gin configuration corresponding to the given parameters.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration file.
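
For illustration, an arbitrary set of parameter values and the resulting gin configuration string:

params = {
    "network_width": 64,
    "network_depth": 2,
    "max_sequence_length": 32,
    "td_lambda": 0.8,
}
print(ActorCriticContinuous.produce_gin_file_from_parameters(params, index=1))
# prms_1/ActorCriticContinuous.network_width = 64
# prms_1/ActorCriticContinuous.network_depth = 2
# prms_1/ActorCriticContinuous.max_sequence_length = 32
# prms_1/ActorCriticContinuous.td_lambda = 0.8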
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: A dictionary mapping each hyperparameter name to its ray.tune sampler.
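
As a sketch of how the search space might be inspected outside a full ray.tune run, each returned domain can be sampled directly (the `sample()` method on ray.tune domains is assumed here):

# Draw one random configuration from the search space.
search_space = ActorCriticContinuous.get_hyperparameters_search_spaces()
config = {name: domain.sample() for name, domain in search_space.items()}
# e.g. {'network_width': 128, 'network_depth': 2,
#       'max_sequence_length': 64, 'td_lambda': 0.9}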
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the given MDP specification and agent parameters.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
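
A hypothetical sketch of building an agent from an explicit parameter dictionary; as in the constructor example above, `mdp_spec` is assumed to be available from a Colosseum MDP and is not constructed here, and the parameter values are arbitrary.

parameters = {
    "network_width": 256,
    "network_depth": 4,
    "max_sequence_length": 128,
    "td_lambda": 0.9,
}
agent = ActorCriticContinuous.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_spec,    # assumed to be available
    parameters=parameters,
)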
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The agent's current estimate of the optimal policy, expressed as a distribution over actions.
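
The property greedily converts the policy network's logits into a deterministic distribution over actions. A small NumPy illustration of that conversion, using toy logits and the same expression as the property:

import numpy as np

# Toy logits for 2 observations and 3 actions.
logits = np.array([[0.2, 1.5, -0.3],
                   [2.0, 0.1, 2.0]])

# Arg-max actions receive probability 1; all others receive 0.
policy = (logits >= logits.max(-1, keepdims=True)).astype(np.float32)
# array([[0., 1., 0.],
#        [1., 0., 1.]], dtype=float32)   # ties yield multiple ones, as in row 2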