colosseum.agent.agents.infinite_horizon.q_learning

from typing import Any, Callable, Dict, Union

import dm_env
import gin
import numpy as np
from ray import tune
from typing_extensions import TYPE_CHECKING

from colosseum.agent.actors import QValuesActor
from colosseum.agent.agents.base import BaseAgent
from colosseum.agent.mdp_models.base import BaseMDPModel
from colosseum.dynamic_programming.utils import get_policy_from_q_values
from colosseum.emission_maps import EmissionMap

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE
    from colosseum.utils.acme.specs import MDPSpec


def get_H(n_states, n_actions, T, span_approx, confidence):
    """
    computes the theoretical value for the horizon approximation value.

    Parameters
    ----------
    n_states : int
        The number of states.
    n_actions : int
        The number of actions.
    T : int
        The optimization horizon.
    span_approx : float
        The span approximation value.
    confidence : float
        One minus the probability of failure.

    Returns
    -------
    float
        The theoretical value for the horizon approximation value.
    """
    return min(
        np.sqrt(span_approx * T / n_states / n_actions),
        (T / n_states / n_actions / np.log(4 * T / confidence)) ** 0.333,
    )


class _QValuesModel(BaseMDPModel):
    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        min_at: float,
        confidence: float,
        span_approx_weight: float,
        get_span_approx: Callable[[int, int], float],
        h_weight: float,
        get_H: Callable[[int, int, int, float, float], float],
    ):
        super(_QValuesModel, self).__init__(seed, mdp_specs)

        self.min_at = min_at if min_at > 0.009 else 0
        self.span_approx = span_approx_weight
        if get_span_approx is not None:
            self.span_approx *= get_span_approx(self._n_states, self._n_actions)

        self.confidence = confidence
        self.optimization_horizon = optimization_horizon

        self.H = h_weight * get_H(
            self._n_states,
            self._n_actions,
            optimization_horizon,
            self.span_approx,
            confidence,
        )
        self.gamma = 1 - 1 / self.H

        self.N = np.zeros((self._n_states, self._n_actions), np.int32)
        self.Q = np.zeros((self._n_states, self._n_actions), np.float32) + self.H
        self.Q_main = np.zeros((self._n_states, self._n_actions), np.float32) + self.H
        self.V = np.zeros((self._n_states,), np.float32) + self.H

    def step_update(
        self,
        ts_t: dm_env.TimeStep,
        a_t: "ACTION_TYPE",
        ts_tp1: dm_env.TimeStep,
        time: int,
    ):
        s_t = ts_t.observation
        s_tp1 = ts_tp1.observation

        self.N[s_t, a_t] += 1
        alpha_t = max(self.min_at, (self.H + 1) / (self.H + self.N[s_t, a_t]))
        b_t = (
            4
            * self.span_approx
            * np.sqrt(
                self.H
                / self.N[s_t, a_t]
                * np.log(2 * self.optimization_horizon / self.confidence)
            )
        )

        self.Q_main[s_t, a_t] = (1 - alpha_t) * self.Q[s_t, a_t] + alpha_t * (
            ts_tp1.reward + self.gamma * self.V[s_tp1] + b_t
        )
        self.Q[s_t, a_t] = min(self.Q[s_t, a_t], self.Q_main[s_t, a_t])
        self.V[s_tp1] = self.Q[s_tp1].max()


@gin.configurable
class QLearningContinuous(BaseAgent):
    """
    The q-learning algorithm with optimism.

    Wei, Chen-Yu, et al. "Model-free reinforcement learning in infinite-horizon average-reward markov decision processes."
    International conference on machine learning. PMLR, 2020.
    """

    @staticmethod
    def is_emission_map_accepted(emission_map: "EmissionMap") -> bool:
        return emission_map.is_tabular

    @staticmethod
    def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):
        string = ""
        for k, v in parameters.items():
            string += f"prms_{index}/QLearningContinuous.{k} = {v}\n"
        return string[:-1]

    @staticmethod
    def is_episodic() -> bool:
        return False

    @staticmethod
    def get_hyperparameters_search_spaces() -> Dict[str, tune.sample.Domain]:
        return {
            "h_weight": tune.uniform(0.001, 1.1),
            "span_approx_weight": tune.uniform(0.001, 1.1),
            "min_at": tune.uniform(0.001, 0.2),
        }

    @staticmethod
    def get_agent_instance_from_parameters(
        seed: int,
        optimization_horizon: int,
        mdp_specs: "MDPSpec",
        parameters: Dict[str, Any],
    ) -> "BaseAgent":
        return QLearningContinuous(
            mdp_specs=mdp_specs,
            seed=seed,
            optimization_horizon=optimization_horizon,
            min_at=parameters["min_at"],
            h_weight=parameters["h_weight"],
            span_approx_weight=parameters["span_approx_weight"],
        )

    @property
    def current_optimal_stochastic_policy(self) -> np.ndarray:
        return get_policy_from_q_values(self._mdp_model.Q, True)

    def __init__(
        self,
        seed: int,
        mdp_specs: "MDPSpec",
        optimization_horizon: int,
        # MDP model parameters
        min_at: float = 0,
        confidence: float = 0.95,
        span_approx_weight: float = 1,
        get_span_approx: Callable[[int, int], float] = None,
        h_weight: float = 1,
        get_H: Callable[[int, int, int, float, float], float] = get_H,
        # Actor parameters
        epsilon_greedy: Union[float, Callable] = None,
        boltzmann_temperature: Union[float, Callable] = None,
    ):
        """

        Parameters
        ----------
        seed : int
            The random seed.
        mdp_specs : MDPSpec
            The full specification of the MDP.
        optimization_horizon : int
            The total number of interactions that the agent is expected to have with the MDP.
        min_at : float
            The minimum value for the alpha coefficient. By default, it is set to zero.
        confidence : float
            One minus the probability of failure. By default, it is set to :math:`0.95`.
        span_approx_weight : float
            The weight given to the value of the span approximation.
        get_span_approx : Callable[[int, int], float]
            The function that computes the value for the span approximation given number of states and number of actions.
            By default, the theoretical value is used.
        h_weight : float
            The weight given to the value of approximate horizon. By default, it is set to one.
        get_H : Callable[[int, int, int, float, float], float]
            The function that computes the approximate horizon given number of states, number of actions, optimization
            horizon, the value of the span approximation, and the confidence. By default, the theoretical value is used.
        epsilon_greedy : Union[float, Callable], optional
            The probability of selecting an action at random. It can be provided as a float or as a function of the
            total number of interactions. By default, the probability is set to zero.
        boltzmann_temperature : Union[float, Callable], optional
            The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of
            the total number of interactions. By default, Boltzmann exploration is disabled.
        """

        assert 0 <= min_at < 0.99
        assert 0 < confidence < 1
        assert span_approx_weight > 0
        assert h_weight > 0

        super(QLearningContinuous, self).__init__(
            seed,
            mdp_specs,
            _QValuesModel(
                seed,
                mdp_specs,
                optimization_horizon,
                min_at,
                confidence,
                span_approx_weight,
                get_span_approx,
                h_weight,
                get_H,
            ),
            QValuesActor(seed, mdp_specs, epsilon_greedy, boltzmann_temperature),
            optimization_horizon,
        )

    def episode_end_update(self):
        pass

    def before_start_interacting(self):
        self._actor.set_q_values(self._mdp_model.Q)

    def step_update(
        self, ts_t: dm_env.TimeStep, a_t: "ACTION_TYPE", ts_tp1: dm_env.TimeStep, h: int
    ):
        super(QLearningContinuous, self).step_update(ts_t, a_t, ts_tp1, h)
        self._actor.set_q_values(self._mdp_model.Q)

    def get_q_value_estimate(self) -> np.ndarray:
        """
        Returns
        -------
        np.ndarray
            The q-values estimate.
        """
        return self._mdp_model.Q
def get_H(n_states, n_actions, T, span_approx, confidence):

Computes the theoretical value of the approximate horizon; a brief usage sketch is given after the Returns entry below.

Parameters
  • n_states (int): The number of states.
  • n_actions (int): The number of actions.
  • T (int): The optimization horizon.
  • span_approx (float): The span approximation value.
  • confidence (float): One minus the probability of failure.
Returns
  • float: The theoretical value for the horizon approximation value.
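
A minimal usage sketch (the input values are arbitrary): the returned value is the approximate horizon H, from which the agent derives its discount factor gamma = 1 - 1 / H (with the default h_weight of one).

# Minimal sketch: compute the approximate horizon for a small tabular MDP.
from colosseum.agent.agents.infinite_horizon.q_learning import get_H

n_states, n_actions = 10, 4   # size of the tabular MDP
T = 100_000                   # optimization horizon (total number of interactions)
span_approx = 1.0             # span approximation value
confidence = 0.95             # one minus the probability of failure

H = get_H(n_states, n_actions, T, span_approx, confidence)
gamma = 1 - 1 / H             # discount factor used internally by the agent
print(H, gamma)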
@gin.configurable
class QLearningContinuous(colosseum.agent.agents.base.BaseAgent):

The q-learning algorithm with optimism.

Wei, Chen-Yu, et al. "Model-free reinforcement learning in infinite-horizon average-reward Markov decision processes." International conference on machine learning. PMLR, 2020.
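
For reference, the update performed at each interaction by the internal _QValuesModel (shown in the source above) can be written as follows, where N(s, a) is the visit count, a_min the min_at parameter, sp the (weighted) span approximation, T the optimization horizon, δ the failure probability, and H the approximate horizon (Q̃ corresponds to Q_main in the code; Q, Q̃, and V are initialized to H):

\alpha_t = \max\!\left(a_{\min},\ \frac{H + 1}{H + N(s_t, a_t)}\right), \qquad
b_t = 4\, \mathrm{sp} \sqrt{\frac{H}{N(s_t, a_t)}\, \log\frac{2T}{\delta}}, \qquad
\gamma = 1 - \frac{1}{H}

\tilde{Q}(s_t, a_t) \leftarrow (1 - \alpha_t)\, Q(s_t, a_t) + \alpha_t \left( r_t + \gamma\, V(s_{t+1}) + b_t \right)

Q(s_t, a_t) \leftarrow \min\!\left\{ Q(s_t, a_t),\ \tilde{Q}(s_t, a_t) \right\}, \qquad
V(s_{t+1}) \leftarrow \max_{a} Q(s_{t+1}, a)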

QLearningContinuous(seed: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, optimization_horizon: int, min_at: float = 0, confidence: float = 0.95, span_approx_weight: float = 1, get_span_approx: Callable[[int, int], float] = None, h_weight: float = 1, get_H: Callable[[int, int, int, float, float], float] = get_H, epsilon_greedy: Union[float, Callable] = None, boltzmann_temperature: Union[float, Callable] = None)
Parameters
  • seed (int): The random seed.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • min_at (float): The minimum value for the alpha coefficient. By default, it is set to zero.
  • confidence (float): One minus the probability of failure. By default, it is set to \( 0.95 \).
  • span_approx_weight (float): The weight given to the value of the span approximation.
  • get_span_approx (Callable[[int, int], float]): The function that computes the value for the span approximation given the number of states and number of actions. By default, the theoretical value is used; a sketch of the callable forms is given after this list.
  • h_weight (float): The weight given to the value of approximate horizon. By default, it is set to one.
  • get_H (Callable[[int, int, int, float, float], float]): The function that computes the approximate horizon given number of states, number of actions, optimization horizon, the value of the span approximation, and the confidence. By default, the theoretical value is used.
  • epsilon_greedy (Union[float, Callable], optional): The probability of selecting an action at random. It can be provided as a float or as a function of the total number of interactions. By default, the probability is set to zero.
  • boltzmann_temperature (Union[float, Callable], optional): The parameter that controls the Boltzmann exploration. It can be provided as a float or as a function of the total number of interactions. By default, Boltzmann exploration is disabled.
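
A sketch of the callable forms mentioned above; the function names and the specific formulas are illustrative assumptions, not values prescribed by the algorithm.

import numpy as np

def my_span_approx(n_states: int, n_actions: int) -> float:
    # Hypothetical span approximation as a function of the MDP size.
    return float(np.log(n_states * n_actions) + 1.0)

def my_epsilon_greedy(total_interactions: int) -> float:
    # Hypothetical exploration probability that decays with the number of interactions so far.
    return float(1.0 / max(1.0, np.sqrt(total_interactions)))

These would be passed as get_span_approx=my_span_approx and epsilon_greedy=my_epsilon_greedy when constructing QLearningContinuous.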
@staticmethod
def is_emission_map_accepted(emission_map: colosseum.emission_maps.base.EmissionMap) -> bool:
Returns
  • bool: True if the agent class accepts the emission map.
@staticmethod
def produce_gin_file_from_parameters(parameters: Dict[str, Any], index: int = 0):

Produces a string containing the gin configuration corresponding to the given parameters.

Parameters
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
  • index (int): The index assigned to the gin configuration.
Returns
  • gin_config (str): The gin configuration file.
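
A brief usage sketch; the parameter values below are arbitrary:

from colosseum.agent.agents.infinite_horizon.q_learning import QLearningContinuous

gin_config = QLearningContinuous.produce_gin_file_from_parameters(
    {"min_at": 0.01, "h_weight": 0.5, "span_approx_weight": 1.0},
    index=0,
)
print(gin_config)
# prms_0/QLearningContinuous.min_at = 0.01
# prms_0/QLearningContinuous.h_weight = 0.5
# prms_0/QLearningContinuous.span_approx_weight = 1.0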
@staticmethod
def is_episodic() -> bool:
Returns
  • bool: True if the agent is suited for the episodic setting.
@staticmethod
def get_hyperparameters_search_spaces() -> Dict[str, ray.tune.sample.Domain]:
Returns
  • Dict[str, tune.sample.Domain]: The dictionary mapping hyperparameter names to the corresponding ray.tune samplers.
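
For instance, a random hyperparameter configuration can be drawn from these domains; a minimal sketch, assuming ray[tune] is installed and using the sample() method of the ray.tune search space domains:

from colosseum.agent.agents.infinite_horizon.q_learning import QLearningContinuous

search_spaces = QLearningContinuous.get_hyperparameters_search_spaces()
# Draw one value from each ray.tune domain.
parameters = {name: domain.sample() for name, domain in search_spaces.items()}
print(parameters)  # e.g. {'h_weight': 0.73, 'span_approx_weight': 0.42, 'min_at': 0.05}

The resulting dictionary has the same keys expected by get_agent_instance_from_parameters and produce_gin_file_from_parameters.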
@staticmethod
def get_agent_instance_from_parameters( seed: int, optimization_horizon: int, mdp_specs: colosseum.utils.acme.specs.MDPSpec, parameters: Dict[str, Any]) -> colosseum.agent.agents.base.BaseAgent:

Returns an agent instance for the given MDP specification and agent parameters.

Parameters
  • seed (int): The random seed.
  • optimization_horizon (int): The total number of interactions that the agent is expected to have with the MDP.
  • mdp_specs (MDPSpec): The full specification of the MDP.
  • parameters (Dict[str, Any]): The dictionary containing the parameters of the agent.
Returns
  • BaseAgent: The agent instance.
current_optimal_stochastic_policy: numpy.ndarray
Returns
  • np.ndarray: The estimate of the optimal policy given the agent's current knowledge, in the form of a distribution over actions.
def episode_end_update(self):

Is called when an episode ends. In the infinite-horizon case, this refers to artificial episodes.

def before_start_interacting(self):

Is called before the agent starts interacting with the MDP.

def step_update( self, ts_t: dm_env._environment.TimeStep, a_t: Union[int, float, numpy.ndarray], ts_tp1: dm_env._environment.TimeStep, h: int):

Adds the given transition to the MDP model.

Parameters
  • ts_t (dm_env.TimeStep): The TimeStep at time t.
  • a_t (ACTION_TYPE): The action taken by the agent at time t.
  • ts_tp1 (dm_env.TimeStep): The TimeStep at time t + 1.
  • h (int): The current time of the environment. In the episodic case, this refers to the in-episode time, whereas in the continuous case it refers to the total number of previous interactions.
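
As a sketch of the expected arguments in the tabular infinite-horizon setting, where observations are state indices (the numerical values are arbitrary, and agent stands for an already constructed QLearningContinuous instance):

import dm_env

# A hypothetical single transition: state 3, action 1, reward 0.5, next state 5.
s_t, a_t, r_t, s_tp1 = 3, 1, 0.5, 5
ts_t = dm_env.transition(reward=0.0, observation=s_t)      # TimeStep at time t
ts_tp1 = dm_env.transition(reward=r_t, observation=s_tp1)  # TimeStep at time t + 1

# agent.step_update(ts_t, a_t, ts_tp1, h) adds this transition to the MDP model
# and refreshes the actor's q-values estimate.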
def get_q_value_estimate(self) -> numpy.ndarray:
Returns
  • np.ndarray: The q-values estimate.