colosseum.agent.agents.episodic.q_learning
```python
from typing import TYPE_CHECKING, Any, Callable, Dict, Union

import dm_env
import gin
import numpy as np
from ray import tune

from colosseum.agent.actors import QValuesActor
from colosseum.agent.agents.base import BaseAgent
from colosseum.agent.mdp_models.base import BaseMDPModel
from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.emission_maps import EmissionMap

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE
    from colosseum.utils.acme.specs import MDPSpec


class QValuesModel(BaseMDPModel):
    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        p: float,
        c_1: float,
        c_2: float = None,
        min_at: float = 0,
        UCB_type="hoeffding",
    ):
        super(QValuesModel, self).__init__(seed, mdp_specs)

        self._UCB_type = UCB_type
        self._min_at = min_at
        self._c_1 = c_1
        self._c_2 = c_2
        self._p = p

        self.i = np.log(self._n_states * self._n_actions * optimization_horizon / p)
        self.N = np.ones((self._H, self._n_states, self._n_actions), np.int32)
        self.Q = (
            np.zeros((self._H, self._n_states, self._n_actions), np.float32) + self._H
        )
        self.V = np.zeros((self._H + 1, self._n_states), np.float32)

        if UCB_type == "bernstein":
            self.mu = np.zeros((self._H, self._n_states, self._n_actions), np.float32)
            self.sigma = np.zeros(
                (self._H, self._n_states, self._n_actions), np.float32
            )
            self.beta = np.zeros((self._H, self._n_states, self._n_actions), np.float32)

    def step_update(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ):
        s_t = ts_t.observation
        s_tp1 = ts_tp1.observation

        self.N[time, s_t, a_t] += 1

        t = self.N[time, s_t, a_t]
        self._alpha_t = max(self._min_at, (self._H + 1) / (self._H + t))

        if self._UCB_type == "hoeffding":
            b_t = self._c_1 * np.sqrt(self._H ** 3 * self.i / t)
        else:
            self.mu[time, s_t, a_t] += self.V[time + 1, s_tp1]
            self.sigma[time, s_t, a_t] += self.V[time + 1, s_tp1] ** 2
            old_beta = self.beta[time, s_t, a_t]
            self.beta[time, s_t, a_t] = min(
                self._c_1
                * (
                    np.sqrt(
                        (
                            self._H
                            * (
                                (self.sigma[time, s_t, a_t] - self.mu[time, s_t, a_t])
                                ** 2
                            )
                            / t ** 2
                            + self._H
                        )
                        * self.i
                    )
                    + np.sqrt(self._H ** 7 * self._n_states * self._n_actions)
                    * self.i
                    / t
                ),
                self._c_2 * np.sqrt(self._H ** 3 * self.i / t),
            )
            b_t = (
                (self.beta[time, s_t, a_t] - (1 - self._alpha_t) * old_beta)
                / 2
                / self._alpha_t
            )

        self.Q[time, s_t, a_t] = self._alpha_t * self.Q[time, s_t, a_t] + (
            1 - self._alpha_t
        ) * (ts_tp1.reward + self.V[time + 1, s_tp1] + b_t)
        self.V[time, s_t] = min(self._H, self.Q[time, s_t].max())


@gin.configurable
class QLearningEpisodic(BaseAgent):
    """
    The q-learning algorithm with UCB exploration.

    Jin, Chi, et al. "Is q-learning provably efficient?." Advances in neural information processing systems 31 (2018).
    """

    @staticmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        return emission_map.is_tabular

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = (
            f"prms_{index}/QLearningEpisodic.p=0.05\n"
            f'prms_{index}/QLearningEpisodic.UCB_type="bernstein"\n'
        )
        for k, v in parameters.items():
            string += f"prms_{index}/QLearningEpisodic.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return True

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "c_1": tune.uniform(0.001, 1.1),
            "c_2": tune.uniform(0.001, 1.1),
            "min_at": tune.uniform(0.001, 0.2),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return QLearningEpisodic(
            mdp_specs=mdp_specs,
            seed=seed,
            optimization_horizon=optimization_horizon,
            min_at=parameters["min_at"],
            c_1=parameters["c_1"],
            c_2=parameters["c_2"],
            UCB_type="bernstein",
            p=0.05,
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        return get_policy_from_q_values(self._mdp_model.Q, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        p: float,
        c_1: float,
        c_2: float = None,
        min_at: float = 0,
        UCB_type="hoeffding",
        # Actor parameters
        epsilon_greedy: Union[float, Callable] = None,
        boltzmann_temperature: Union[float, Callable] = None,
    ):
        """
        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        p : float
            The value of the probability of failure.
        c_1 : float
            The value of the :math:`c_1` coefficient.
        c_2 : float
            The value of the :math:`c_2` coefficient.
        min_at : float
            The minimum value for the alpha coefficient. By default, it is set to zero.
        UCB_type : str
            The type of UCB bonus. It can either be 'hoeffding' or 'bernstein'.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of the
            total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of
            the total number of interactions. By default, Boltzmann exploration is disabled.
        """

        UCB_type = UCB_type.lower()
        assert 0 <= min_at < 0.99
        assert 0 < p < 1
        assert c_1 > 0
        assert UCB_type in ["hoeffding", "bernstein"]
        if UCB_type == "bernstein":
            assert c_2 is not None and c_2 > 0

        super(QLearningEpisodic, self).__init__(
            seed,
            mdp_specs,
            QValuesModel(
                seed,
                mdp_specs,
                optimization_horizon,
                p,
                c_1,
                c_2,
                min_at,
                UCB_type,
            ),
            QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature),
            optimization_horizon,
        )

    def episode_end_update(self):
        pass

    def before_start_interacting(self):
        self._actor.set_q_values(self._mdp_model.Q)

    def step_update(
        self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, h: int
    ):
        super(QLearningEpisodic, self).step_update(ts_t, a_t, ts_tp1, h)
        self._actor.set_q_values(self._mdp_model.Q)
```
QValuesModel maintains the visit counts, optimistic Q-value estimates, and exploration bonuses used by QLearningEpisodic. It derives from BaseMDPModel, the base class for MDP models.
QValuesModel( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, p: float, c_1: float, c_2: float = None, min_at: float = 0, UCB_type='hoeffding')
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- p (float): The value of the probability of failure.
- c_1 (float): The value of the \( c_1 \) coefficient.
- c_2 (float): The value of the \( c_2 \) coefficient.
- min_at (float): The minimum value for the alpha coefficient. By default, it is set to zero.
- UCB_type (str): The type of UCB bonus. It can either be 'hoeffding' or 'bernstein'.
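Reading off the constructor (see the source listing at the top of the page), and writing \( S \), \( A \), \( H \), and \( T \) for the number of states, the number of actions, the episode horizon, and the optimization horizon respectively, the model is initialized as

\[
\iota = \log\!\left(\frac{S\,A\,T}{p}\right), \qquad N_h(s, a) = 1, \qquad Q_h(s, a) = H, \qquad V_h(s) = 0,
\]

with the running statistics \( \mu \), \( \sigma \), and \( \beta \) additionally initialized to zero when the Bernstein bonus is selected.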
def step_update(self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, time: int):
Updates the model with the transition in input; the resulting update rule is written out after the parameter list below.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t ("ACTION_TYPE"): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- time (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case this refers to the total number of previous interactions.
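In symbols, the update implemented by step_update (Hoeffding variant; see the source listing at the top of the page) uses the updated visit count \( t = N_h(s, a) \), the learning rate \( \alpha_t = \max(\mathrm{min\_at},\ \tfrac{H+1}{H+t}) \), and the log term \( \iota \) stored in self.i:

\[
b_t = c_1 \sqrt{\frac{H^3 \iota}{t}}, \qquad Q_h(s, a) \leftarrow \alpha_t\, Q_h(s, a) + (1 - \alpha_t)\big(r + V_{h+1}(s') + b_t\big), \qquad V_h(s) \leftarrow \min\!\big(H,\ \max_{a'} Q_h(s, a')\big).
\]

The Bernstein variant replaces \( b_t \) with a bonus computed from the running first and second moments of \( V_{h+1}(s') \) stored in self.mu and self.sigma.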
The Q-learning algorithm with UCB exploration.
Jin, Chi, et al. "Is Q-learning provably efficient?" Advances in Neural Information Processing Systems 31 (2018).
QLearningEpisodic( seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, p: float, c_1: float, c_2: float = None, min_at: float = 0, UCB_type='hoeffding', epsilon_greedy: Union[float, Callable] = None, boltzmann_temperature: Union[float, Callable] = None)
Parameters
- seed (int): The random seed.
- mdp_specs (MDPSpec): The full specification of the MDP.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- p (float): The value of the probability of failure.
- c_1 (float): The value of the \( c_1 \) coefficient.
- c_2 (float): The value of the \( c_2 \) coefficient.
- min_at (float): The minimum value for the alpha coefficient. By default, it is set to zero.
- UCB_type (str): The type of UCB bonus. It can either be 'hoeffding' or 'bernstein'.
- epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
- boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
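A minimal construction sketch follows. The hyperparameter values are illustrative, and mdp_specs is assumed to be the MDPSpec of the environment the agent will interact with (obtained from the colosseum MDP in use); neither comes from the source above.

```python
from colosseum.agent.agents.episodic.q_learning import QLearningEpisodic

# Illustrative configuration: a Hoeffding-bonus agent. `mdp_specs` and the
# interaction budget are assumed to be available from the surrounding experiment.
agent = QLearningEpisodic(
    seed=42,
    mdp_specs=mdp_specs,           # MDPSpec of the target MDP (assumed given)
    optimization_horizon=100_000,  # total number of interactions with the MDP
    p=0.05,                        # probability of failure
    c_1=0.5,                       # UCB bonus coefficient
    UCB_type="hoeffding",          # "bernstein" additionally requires c_2 > 0
)
```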
@staticmethod
def is_emission_map_accepted(emission_map: colosseum.emission_maps.base.EmissionMap) -> bool:
Returns
- bool: True if the agent class accepts the emission map.
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
Produces a string containing the gin config file corresponding to the parameters given in input (an example of the output is shown below).
Parameters
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
- index (int): The index assigned to the gin configuration.
Returns
- gin_config (str): The gin configuration file.
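For illustration, with arbitrarily chosen parameter values, the produced configuration fixes p and UCB_type and then adds one binding per supplied parameter:

```python
print(
    QLearningEpisodic.produce_gin_file_from_parameters(
        {"c_1": 0.5, "c_2": 0.8, "min_at": 0.05}, index=0
    )
)
# prms_0/QLearningEpisodic.p=0.05
# prms_0/QLearningEpisodic.UCB_type="bernstein"
# prms_0/QLearningEpisodic.c_1 = 0.5
# prms_0/QLearningEpisodic.c_2 = 0.8
# prms_0/QLearningEpisodic.min_at = 0.05
```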
@staticmethod
def is_episodic() -> bool:
Returns
- bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
- Dict[str, tune.sample.Domain]: The dictionary whose keys are hyperparameter names and whose values are the corresponding ray.tune samplers.
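A small sketch of how these domains might be used, assuming Ray Tune's Domain.sample() interface for drawing a single value from each search space:

```python
space = QLearningEpisodic.get_hyperparameters_search_spaces()

# Draw one random configuration from the uniform search ranges,
# e.g. {"c_1": ..., "c_2": ..., "min_at": ...}.
parameters = {name: domain.sample() for name, domain in space.items()}
```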
@staticmethod
def get_agent_instance_from_parameters(seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:
Returns an agent instance for the MDP specification and agent parameters given in input (see the sketch below).
Parameters
- seed (int): The random seed.
- optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
- mdp_specs (MDPSpec): The full specification of the MDP.
- parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
- BaseAgent: The agent instance.
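Continuing the sampling sketch above, the drawn parameters can be turned into a ready-to-use agent; mdp_specs and the interaction budget are again assumed to be given:

```python
agent = QLearningEpisodic.get_agent_instance_from_parameters(
    seed=0,
    optimization_horizon=100_000,
    mdp_specs=mdp_specs,    # assumed MDPSpec of the target environment
    parameters=parameters,  # must contain "c_1", "c_2", and "min_at"
)
```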
current_optimal_stochastic_policy: numpy.ndarray
Returns
- np.ndarray: The estimate of the optimal policy given the agent's current knowledge, expressed as a distribution over actions.
def episode_end_update(self):
Called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.
def step_update(self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, h: int):
Adds the input transition to the MDP model and refreshes the actor's Q-value estimates; a full interaction loop is sketched after the parameter list below.
Parameters
- ts_t (dm_env.TimeStep): The TimeStep at time t.
- a_t ("ACTION_TYPE"): The action taken by the agent at time t.
- ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
- h (int): The current time of the environment, i.e., the in-episode time step at which the transition occurred.
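The hooks before_start_interacting, step_update, and episode_end_update are meant to be driven by an interaction loop. A rough sketch is given below; it assumes a dm_env.Environment with episodes of horizon H, and an action-selection method select_action(ts_t, h) on the agent, whose name is an assumption here rather than part of the source shown on this page.

```python
agent.before_start_interacting()                 # pushes the initial Q-values to the actor
for episode in range(n_episodes):                # n_episodes: interaction budget (assumed)
    ts_t = env.reset()                           # env: a dm_env.Environment (assumed)
    for h in range(H):                           # H: episode horizon (assumed)
        a_t = agent.select_action(ts_t, h)       # assumed BaseAgent action-selection hook
        ts_tp1 = env.step(a_t)
        agent.step_update(ts_t, a_t, ts_tp1, h)  # updates the model and the actor's Q-values
        ts_t = ts_tp1
        if ts_tp1.last():
            break
    agent.episode_end_update()
```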