colosseum.agent.actors.Q_values_actor
from typing import TYPE_CHECKING, Callable, Union

import dm_env
import numpy as np

from colosseum.agent.actors import BaseActor
from colosseum.utils.acme.specs import MDPSpec

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE


class QValuesActor(BaseActor):
    """
    The `QValuesActor` is an actor component that selects the actions that maximise a given q-value estimate.
    It also supports the epsilon-greedy and the Boltzmann action selection strategies to boost exploration.
    """

    def __init__(
        self,
        seed: int,
        mdp_specs: MDPSpec,
        epsilon_greedy: Union[float, Callable[[int], float]] = None,
        boltzmann_temperature: Union[float, Callable[[int], float]] = None,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of
            the total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function
            of the total number of interactions. By default, Boltzmann exploration is disabled.
        """
        super(QValuesActor, self).__init__(seed, mdp_specs)

        # Constant values are wrapped into constant schedules. The float is bound as a default
        # argument so that the lambda returns the value rather than the (rebound) name.
        if epsilon_greedy is not None:
            if type(epsilon_greedy) == float:
                epsilon_greedy = lambda t, eps=epsilon_greedy: eps
        if boltzmann_temperature is not None:
            if type(boltzmann_temperature) == float:
                boltzmann_temperature = lambda t, temp=boltzmann_temperature: temp

        self._epsilon_greedy = epsilon_greedy
        self._boltzmann_temperature = boltzmann_temperature
        self._total_interactions = 0
        self._q_values = None
        self._n_states = self._mdp_spec.observations.num_values
        self._n_actions = self._mdp_spec.actions.num_values

    def set_q_values(self, Q: np.ndarray):
        """
        Updates the q-value estimates of the component with the ones given in input.

        Parameters
        ----------
        Q : np.ndarray
            The q-value estimates.
        """
        self._q_values = Q
        # A three-dimensional array (in-episode time, state, action) signals the episodic setting.
        self._episodic = Q.ndim == 3

    def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE":
        assert self._q_values is not None, "The q values have not been initialized."

        self._total_interactions += 1

        # Epsilon-greedy policy
        if self._epsilon_greedy is not None:
            if self._rng_fast.random() < self._epsilon_greedy(self._total_interactions):
                return self._rng_fast.randint(0, self._n_actions - 1)

        # Retrieve the q-values of the current state (and in-episode time, if episodic)
        q = self._q_values[(time, ts.observation) if self._episodic else ts.observation]

        # Boltzmann exploration
        if self._boltzmann_temperature is not None:
            q = np.exp(self._boltzmann_temperature(self._total_interactions) * q)
            return self._rng.choice(
                range(self._n_actions), replace=False, p=q / q.sum()
            )

        # Greedy selection with random tie-breaking
        return self._rng.choice(np.where(q == q.max())[0])
class QValuesActor(BaseActor):
The QValuesActor is an actor component that selects the actions that maximise a given q-value estimate. It also supports the epsilon-greedy and the Boltzmann action selection strategies to boost exploration.
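A minimal usage sketch is shown below. It assumes that an MDPSpec instance mdp_specs and a dm_env.TimeStep ts for the current state are already available from a Colosseum MDP; the q-value table shape corresponds to the continuous (non-episodic) setting.

import numpy as np

from colosseum.agent.actors.Q_values_actor import QValuesActor

# `mdp_specs` and `ts` are assumed to be provided by a Colosseum MDP / environment loop.
actor = QValuesActor(
    seed=42,
    mdp_specs=mdp_specs,
    epsilon_greedy=0.1,  # select a random action 10% of the time
)

# A (num_states, num_actions) table of q-value estimates (ndim == 2, i.e. continuous setting).
n_states = mdp_specs.observations.num_values
n_actions = mdp_specs.actions.num_values
actor.set_q_values(np.zeros((n_states, n_actions), dtype=np.float32))

# Epsilon-greedy action for the current observation.
action = actor.select_action(ts, time=0)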
QValuesActor(seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, epsilon_greedy: Union[float, Callable[[int], float]] = None, boltzmann_temperature: Union[float, Callable[[int], float]] = None)
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
- boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
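Both exploration parameters accept either a constant or a schedule expressed as a function of the total number of interactions. A sketch of a linearly decaying epsilon, with purely illustrative decay values and the same assumed mdp_specs as above:

# Epsilon decays linearly from 1.0 to a floor of 0.05 over the first 10,000 interactions.
def epsilon_schedule(total_interactions: int) -> float:
    return max(0.05, 1.0 - total_interactions / 10_000)

exploring_actor = QValuesActor(
    seed=0,
    mdp_specs=mdp_specs,              # assumed to be available
    epsilon_greedy=epsilon_schedule,  # called with the running interaction count
)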
def set_q_values(self, Q: numpy.ndarray):
Updates the q-value estimates of the component with the ones given in input.
Parameters
- Q (np.ndarray): The q-value estimates.
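The dimensionality of Q is what switches the actor between the episodic and the continuous setting: a three-dimensional array is indexed as Q[in_episode_time, observation], while a two-dimensional one is indexed as Q[observation]. A short sketch with illustrative sizes, reusing the actor from above:

H, n_states, n_actions = 10, 25, 4  # illustrative sizes

# Episodic setting: one (n_states, n_actions) table per in-episode time step.
actor.set_q_values(np.zeros((H, n_states, n_actions)))  # Q.ndim == 3

# Continuous setting: a single (n_states, n_actions) table.
actor.set_q_values(np.zeros((n_states, n_actions)))     # Q.ndim == 2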
def select_action(self, ts: dm_env._environment.TimeStep, time: int) -> Union[int, float, numpy.ndarray]:
Parameters
- ts (dm_env.TimeStep): The TimeStep for which the agent is required to calculate the next action.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
- action (ACTION_TYPE): The action that the agent suggests to take given the observation and the time step.
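When a Boltzmann temperature is set, the action is sampled from a softmax over the q-values of the current state, with p(a) proportional to exp(temperature * q[a]); the parameter therefore acts as an inverse temperature, and larger values concentrate the probability mass on the greedy action. A small numerical sketch of that probability computation, using illustrative q-values:

import numpy as np

q = np.array([1.0, 2.0, 0.5])  # q-values of the current state (illustrative)

for temperature in (0.5, 5.0):
    weights = np.exp(temperature * q)  # same transformation used in select_action
    probabilities = weights / weights.sum()
    print(temperature, np.round(probabilities, 3))
# temperature 0.5 -> roughly [0.29, 0.48, 0.23]   (close to uniform)
# temperature 5.0 -> roughly [0.007, 0.993, 0.001] (nearly greedy)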