colosseum.agent.agents.infinite_horizon.actor_critic_rnn
```python
from typing import Dict, Any, TYPE_CHECKING

import gin
import numpy as np
import sonnet as snt
import tensorflow as tf
from bsuite.baselines.tf.actor_critic_rnn import PolicyValueRNN, ActorCriticRNN
from ray import tune

from colosseum.utils.non_tabular.bsuite import NonTabularBsuiteAgentWrapper

if TYPE_CHECKING:
    from colosseum.agent.agents.base import BaseAgent
    from colosseum.utils.acme.specs import MDPSpec


@gin.configurable
class ActorCriticRNNContinuous(NonTabularBsuiteAgentWrapper):
    """
    The wrapper for the `ActorCriticRNN` agent from `bsuite`.
    """

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/ActorCriticRNNContinuous.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return False

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "network_width": tune.choice([64, 128, 256]),
            "network_depth": tune.choice([2, 4]),
            "max_sequence_length": tune.choice([16, 32, 64, 128]),
            "td_lambda": tune.choice([0.7, 0.8, 0.9]),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return ActorCriticRNNContinuous(
            seed,
            mdp_specs,
            optimization_horizon,
            parameters["network_width"],
            parameters["network_depth"],
            parameters["max_sequence_length"],
            parameters["td_lambda"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        logits = tf.stop_gradient(
            self._agent._network(
                tf.convert_to_tensor(self.emission_map.all_observations),
                self._agent._network.initial_state(self._mdp_spec.n_states),
            )[0][0]
        ).numpy()

        return (logits >= logits.max(-1, keepdims=True)).astype(np.float32)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        network_width: int,
        network_depth: int,
        max_sequence_length: int,
        td_lambda: float,
    ):
        r"""
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        network_width : int
            The width of the neural networks of the agent.
        network_depth : int
            The depth of the neural networks of the agent.
        max_sequence_length : int
            The maximum sequence length for training the agent.
        td_lambda : float
            The TD(:math:`\lambda`) parameter for training the agent.
        """

        tf.random.set_seed(seed)
        np.random.seed(seed)

        network = PolicyValueRNN(
            hidden_sizes=[network_width] * network_depth,
            n_actions=mdp_specs.actions.num_values,
        )
        agent = ActorCriticRNN(
            obs_spec=mdp_specs.observations,
            action_spec=mdp_specs.actions,
            network=network,
            optimizer=snt.optimizers.Adam(learning_rate=3e-3),
            max_sequence_length=max_sequence_length,
            td_lambda=td_lambda,
            discount=0.99,
            seed=seed,
        )
        super(ActorCriticRNNContinuous, self).__init__(seed, agent, mdp_specs)
```
@gin.configurable
class ActorCriticRNNContinuous(NonTabularBsuiteAgentWrapper)
The wrapper for the ActorCriticRNN agent from bsuite.
ActorCriticRNNContinuous( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, network_width: int, network_depth: int, max_sequence_length: int, td_lambda: float)
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- network_width (int): The width of the neural networks of the agent.
- network_depth (int): The depth of the neural networks of the agent.
- max_sequence_length (int): The maximum sequence length for training the agent.
- td_lambda (float): The TD(λ) parameter for training the agent.
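A minimal construction sketch (the hyperparameter values are illustrative, and mdp_specs is assumed to be an already-built MDPSpec for the target MDP):

```python
from colosseum.agent.agents.infinite_horizon.actor_critic_rnn import (
    ActorCriticRNNContinuous,
)

# `mdp_specs` is assumed to be a colosseum.utils.acme.specs.MDPSpec describing
# the target MDP; how it is obtained is outside the scope of this sketch.
agent = ActorCriticRNNContinuous(
    seed=42,
    mdp_specs=mdp_specs,
    optimization_horizon=100_000,
    network_width=128,
    network_depth=2,
    max_sequence_length=32,
    td_lambda=0.8,
)
```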
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
Produces a string containing the gin configuration corresponding to the parameters provided as input.
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration as a string.
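For illustration, each parameter produces one binding scoped under prms_{index}; a usage sketch with example values:

```python
parameters = {
    "network_width": 128,
    "network_depth": 2,
    "max_sequence_length": 32,
    "td_lambda": 0.8,
}
print(ActorCriticRNNContinuous.produce_gin_file_from_parameters(parameters, index=0))
# prms_0/ActorCriticRNNContinuous.network_width = 128
# prms_0/ActorCriticRNNContinuous.network_depth = 2
# prms_0/ActorCriticRNNContinuous.max_sequence_length = 32
# prms_0/ActorCriticRNNContinuous.td_lambda = 0.8
```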
@staticmethod
def is_episodic() -> bool:
Returns
- bool: True if the agent is suited for the episodic setting; this infinite-horizon variant returns False.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
- Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding ray.tune samplers.
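As a sketch, the returned domains can be sampled directly with ray.tune's Domain.sample(); in a full hyperparameter search they would instead be handed to the Ray Tune tuning loop. The printed values are only an example draw:

```python
search_spaces = ActorCriticRNNContinuous.get_hyperparameters_search_spaces()
# Each value is a ray.tune domain; .sample() draws one candidate configuration.
parameters = {name: domain.sample() for name, domain in search_spaces.items()}
print(parameters)
# e.g. {'network_width': 256, 'network_depth': 2,
#       'max_sequence_length': 64, 'td_lambda': 0.9}
```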
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:
Returns an agent instance for the MDP specification and agent parameters provided as input.
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
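A usage sketch that builds the agent from a fixed parameter dictionary; the values are illustrative, the optimization horizon is arbitrary, and mdp_specs is assumed to be an MDPSpec built elsewhere:

```python
parameters = {
    "network_width": 64,
    "network_depth": 4,
    "max_sequence_length": 16,
    "td_lambda": 0.7,
}
agent = ActorCriticRNNContinuous.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=50_000,
    mdp_specs=mdp_specs,  # assumed MDPSpec for the target MDP
    parameters=parameters,
)
```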