colosseum.agent.actors.Q_values_actor

from typing import TYPE_CHECKING, Callable, Union

import dm_env
import numpy as np

from colosseum.agent.actors import BaseActor
from colosseum.utils.acme.specs import MDPSpec

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE


class QValuesActor(BaseActor):
    """
    The `QValuesActor` is an actor component that selects actions that maximise a given q-values estimate. It also
    supports the epsilon-greedy and the Boltzmann action selection strategies to boost exploration.
    """

    def __init__(
        self,
        seed: int,
        mdp_specs: MDPSpec,
        epsilon_greedy: Union[float, Callable[[int], float]] = None,
        boltzmann_temperature: Union[float, Callable[[int], float]] = None,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of the
            total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of
            the total number of interactions. By default, Boltzmann exploration is disabled.
        """
        super(QValuesActor, self).__init__(seed, mdp_specs)

        # Wrap constant exploration parameters into schedules over the total number
        # of interactions. The constant is captured as a default argument so that
        # the lambda does not refer to its own (rebound) name.
        if epsilon_greedy is not None:
            if isinstance(epsilon_greedy, float):
                epsilon_greedy = lambda t, _eps=epsilon_greedy: _eps
        if boltzmann_temperature is not None:
            if isinstance(boltzmann_temperature, float):
                boltzmann_temperature = lambda t, _temp=boltzmann_temperature: _temp

        self._epsilon_greedy = epsilon_greedy
        self._boltzmann_temperature = boltzmann_temperature
        self._total_interactions = 0
        self._q_values = None
        self._n_states = self._mdp_spec.observations.num_values
        self._n_actions = self._mdp_spec.actions.num_values

    def set_q_values(self, Q: np.ndarray):
        """
        Updates the q-values estimates of the component with the ones given as input.

        Parameters
        ----------
        Q : np.ndarray
            The q-values estimates.
        """
        self._q_values = Q
        # A three-dimensional array is interpreted as episodic q-values indexed by
        # in-episode time; any other shape is treated as continuing q-values.
        self._episodic = Q.ndim == 3

    def select_action(self, ts: dm_env.TimeStep, time: int) -> "ACTION_TYPE":
        assert self._q_values is not None, "The q values have not been initialized."

        self._total_interactions += 1

        # Epsilon greedy policy
        if self._epsilon_greedy is not None:
            if self._rng_fast.random() < self._epsilon_greedy(self._total_interactions):
                return self._rng_fast.randint(0, self._n_actions - 1)

        # Retrieve the q-values
        q = self._q_values[(time, ts.observation) if self._episodic else ts.observation]

        # Boltzmann exploration
        if self._boltzmann_temperature is not None:
            q = np.exp(self._boltzmann_temperature(self._total_interactions) * q)
            return self._rng.choice(
                range(self._n_actions), replace=False, p=q / q.sum()
            )

        # Greedy selection
        return self._rng.choice(np.where(q == q.max())[0])
class QValuesActor(colosseum.agent.actors.base.BaseActor):

The QValuesActor is an actor component that selects actions that maximise a given q-values estimate. It also supports the epsilon-greedy and the Boltzmann action selection strategies to boost exploration.

QValuesActor( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, epsilon_greedy: Union[float, Callable[[int], float]] = None, boltzmann_temperature: Union[float, Callable[[int], float]] = None)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
  • boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
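Both exploration parameters accept either a constant or a schedule over the total number of interactions. A hedged sketch of the two call styles, reusing the stand-in `mdp_spec` from the module-level example above (the decay rates are illustrative, not library defaults):

# Constant exploration: act at random 10% of the time throughout training.
actor_const = QValuesActor(seed=0, mdp_specs=mdp_spec, epsilon_greedy=0.1)

# Schedules: both parameters can be functions of the total number of
# interactions t, so exploration can be annealed over time.
actor_sched = QValuesActor(
    seed=0,
    mdp_specs=mdp_spec,
    epsilon_greedy=lambda t: max(0.01, 1.0 / np.sqrt(t)),
    boltzmann_temperature=lambda t: np.log(t + 1.0),
)

Note that `select_action` multiplies the q-values by this parameter inside the exponential, so it acts as an inverse temperature: larger values concentrate the Boltzmann distribution on the greedy action.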
def set_q_values(self, Q: numpy.ndarray):

Updates the q-values estimates of the component with the ones given as input.

Parameters
  • Q (np.ndarray): The q-values estimates.
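The dimensionality of the array is what switches the actor between the continuing and the episodic setting. A short sketch, continuing the example above and using the (num_states, num_actions) and (H, num_states, num_actions) layouts implied by the indexing in `select_action`:

n_states, n_actions, H = 4, 2, 10

# Continuing setting: Q[s, a].
actor.set_q_values(np.zeros((n_states, n_actions)))

# Episodic setting: Q[h, s, a]. Since Q.ndim == 3, the actor switches to episodic
# mode and `select_action` will index the first axis with its `time` argument.
actor.set_q_values(np.zeros((H, n_states, n_actions)))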
def select_action( self, ts: dm_env._environment.TimeStep, time: int) -> Union[int, float, numpy.ndarray]:
Parameters
  • ts (dm_env.TimeStep): The TimeStep for which the agent is required to calculate the next action.
  • time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
Returns
  • action (ACTION_TYPE): The action that the agent suggests to take given the observation and the time step.
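A hedged sketch of an episodic rollout, reusing the actor and the episodic q-values from the sketch above: `time` is the in-episode step and indexes the first axis of the q-values array, while the epsilon-greedy and Boltzmann schedules internally receive the total number of interactions.

ts = dm_env.restart(0)                      # first observation of the episode
for h in range(H):
    action = actor.select_action(ts, time=h)
    # A real run would step the Colosseum MDP with `action`; here a placeholder
    # next state and reward are used purely to build the next TimeStep.
    next_state, reward = 0, 0.0
    ts = dm_env.transition(reward=reward, observation=next_state)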