colosseum.mdp.river_swim.base

  1import abc
  2from dataclasses import dataclass
  3from enum import IntEnum
  4from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union
  5
  6import numpy as np
  7from scipy.stats import beta, rv_continuous
  8
  9from colosseum.mdp import BaseMDP
 10from colosseum.mdp.utils.custom_samplers import NextStateSampler
 11from colosseum.utils.miscellanea import (
 12    check_distributions,
 13    deterministic,
 14    get_dist,
 15    rounding_nested_structure,
 16)
 17
 18if TYPE_CHECKING:
 19    from colosseum.mdp import ACTION_TYPE, NODE_TYPE
 20
 21
 22@dataclass(frozen=True)
 23class RiverSwimNode:
 24    """
 25    The node for the RiverSwim MDP.
 26    """
 27
 28    X: int
 29    """x coordinate."""
 30
 31    def __str__(self):
 32        return f"X={self.X}"
 33
 34    def __iter__(self):
 35        return iter((self.X, self.X))
 36
 37
 38class RiverSwimAction(IntEnum):
 39    """
 40    The actions available in the RiverSwim MDP.
 41    """
 42
 43    LEFT = 0
 44    RIGHT = 1
 45
 46
 47class RiverSwimMDP(BaseMDP, abc.ABC):
 48    """
 49    The base class for the RiverSwim family.
 50    """
 51
 52    @staticmethod
 53    def get_action_class() -> RiverSwimAction:
 54        return RiverSwimAction
 55
 56    @staticmethod
 57    def get_unique_symbols() -> List[str]:
 58        return [" ", "A", "S", "G"]
 59
 60    @staticmethod
 61    def does_seed_change_MDP_structure() -> bool:
 62        return False
 63
 64    @staticmethod
 65    def sample_mdp_parameters(
 66        n: int, is_episodic: bool, seed: int = None
 67    ) -> List[Dict[str, Any]]:
 68        rng = np.random.RandomState(np.random.randint(10_000) if seed is None else seed)
 69        samples = []
 70        for _ in range(n):
 71            p_rand, p_lazy, _ = 0.9 * rng.dirichlet([0.2, 0.2, 5])
 72            sample = dict(
 73                size=int(np.minimum(2.5 + (200 / (45 * rng.random() + 11)), 25))
 74                if is_episodic
 75                else int((6 * rng.random() + 2) ** 2.2),
 76                make_reward_stochastic=rng.choice([True, False]),
 77                p_rand=p_rand,
 78                p_lazy=p_lazy,
 79                reward_variance_multiplier=2 * rng.random() + 0.005,
 80            )
 81            sample["p_rand"] = None if sample["p_rand"] < 0.01 else sample["p_rand"]
 82            sample["p_lazy"] = None if sample["p_lazy"] < 0.01 else sample["p_lazy"]
 83
 84            if sample["make_reward_stochastic"]:
 85                sample["sub_optimal_distribution"] = (
 86                    "beta",
 87                    (
 88                        sample["reward_variance_multiplier"],
 89                        sample["reward_variance_multiplier"] * (1 / 0.2 - 1),
 90                    ),
 91                )
 92                sample["optimal_distribution"] = (
 93                    "beta",
 94                    (
 95                        sample["reward_variance_multiplier"],
 96                        sample["reward_variance_multiplier"] * (1 / 0.9 - 1),
 97                    ),
 98                )
 99                sample["other_distribution"] = (
100                    "beta",
101                    (
102                        sample["reward_variance_multiplier"],
103                        sample["reward_variance_multiplier"] * (10 / 0.2 - 1),
104                    ),
105                )
106            else:
107                sample["sub_optimal_distribution"] = (
108                    "deterministic",
109                    (round(5 / 1000, 3),),
110                )
111                sample["optimal_distribution"] = ("deterministic", (1.0,))
112                sample["other_distribution"] = ("deterministic", (0.0,))
113
114            samples.append(rounding_nested_structure(sample))
115        return samples
116
117    @staticmethod
118    def get_node_class() -> Type[RiverSwimNode]:
119        return RiverSwimNode
120
121    def get_gin_parameters(self, index: int) -> str:
122        prms = dict(
123            size=self._size,
124            make_reward_stochastic=self._make_reward_stochastic,
125            reward_variance_multiplier=self._reward_variance_multiplier,
126            optimal_distribution=(
127                self._optimal_distribution.dist.name,
128                self._optimal_distribution.args,
129            ),
130            other_distribution=(
131                self._other_distribution.dist.name,
132                self._other_distribution.args,
133            ),
134            sub_optimal_distribution=(
135                self._sub_optimal_distribution.dist.name,
136                self._sub_optimal_distribution.args,
137            ),
138        )
139
140        if self._p_rand is not None:
141            prms["p_rand"] = self._p_rand
142        if self._p_lazy is not None:
143            prms["p_lazy"] = self._p_lazy
144
145        return RiverSwimMDP.produce_gin_file_from_mdp_parameters(
146            prms, type(self).__name__, index
147        )
148
149    @property
150    def n_actions(self) -> int:
151        return len(RiverSwimAction)
152
153    def __init__(
154        self,
155        seed: int,
156        size: int,
157        optimal_mean_reward: float = 0.9,
158        sub_optimal_mean_reward: float = 0.2,
159        sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
160        optimal_distribution: Union[Tuple, rv_continuous] = None,
161        other_distribution: Union[Tuple, rv_continuous] = None,
162        make_reward_stochastic=False,
163        reward_variance_multiplier: float = 1.0,
164        **kwargs,
165    ):
166        """
167        Parameters
168        ----------
169        seed : int
170            The seed used for sampling rewards and next states.
171        size : int
172            The length of the chain.
173        optimal_mean_reward : float
174            If the rewards are made stochastic, this parameter controls the mean reward for the highly rewarding states.
175            By default, it is set to 0.9.
176        sub_optimal_mean_reward : float
177            If the rewards are made stochastic, this parameter controls the mean reward for the suboptimal states.
178            By default, it is set to 0.2.
179        sub_optimal_distribution : Union[Tuple, rv_continuous]
180            The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
181            parameters or as a rv_continuous object.
182        optimal_distribution : Union[Tuple, rv_continuous]
183            The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
184            or as a rv_continuous object.
185        other_distribution : Union[Tuple, rv_continuous]
186            The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
187            rv_continuous object.
188        make_reward_stochastic : bool
189            If True, the rewards of the MDP will be stochastic. By default, it is set to False.
190        reward_variance_multiplier : float
191            A constant that can be used to increase the variance of the reward distributions without changing their means.
192            The lower the value, the higher the variance. By default, it is set to 1.
193        """
194
195        if type(sub_optimal_distribution) == tuple:
196            sub_optimal_distribution = get_dist(
197                sub_optimal_distribution[0], sub_optimal_distribution[1]
198            )
199        if type(optimal_distribution) == tuple:
200            optimal_distribution = get_dist(
201                optimal_distribution[0], optimal_distribution[1]
202            )
203        if type(other_distribution) == tuple:
204            other_distribution = get_dist(other_distribution[0], other_distribution[1])
205
206        self._size = size
207        self._optimal_mean_reward = optimal_mean_reward
208        self._sub_optimal_mean_reward = sub_optimal_mean_reward
209        self._optimal_distribution = optimal_distribution
210        self._sub_optimal_distribution = sub_optimal_distribution
211        self._other_distribution = other_distribution
212
213        dists = [
214            sub_optimal_distribution,
215            optimal_distribution,
216            other_distribution,
217        ]
218        if dists.count(None) == 0:
219            self._sub_optimal_distribution = sub_optimal_distribution
220            self._optimal_distribution = optimal_distribution
221            self._other_distribution = other_distribution
222        else:
223            if make_reward_stochastic:
224                if self.is_episodic():
225                    sub_optimal_mean_reward /= self._size
226                self._sub_optimal_distribution = beta(
227                    reward_variance_multiplier,
228                    reward_variance_multiplier * (1 / sub_optimal_mean_reward - 1),
229                )
230                self._optimal_distribution = beta(
231                    reward_variance_multiplier,
232                    reward_variance_multiplier * (1 / optimal_mean_reward - 1),
233                )
234                self._other_distribution = beta(
235                    reward_variance_multiplier,
236                    reward_variance_multiplier * (10 / sub_optimal_mean_reward - 1),
237                )
238            else:
239                self._sub_optimal_distribution = deterministic(5 / 1000)
240                self._optimal_distribution = deterministic(1.0)
241                self._other_distribution = deterministic(0.0)
242
243        super(RiverSwimMDP, self).__init__(
244            seed=seed,
245            reward_variance_multiplier=reward_variance_multiplier,
246            make_reward_stochastic=make_reward_stochastic,
247            **kwargs,
248        )
249
250    def _get_next_nodes_parameters(
251        self, node: "NODE_TYPE", action: "ACTION_TYPE"
252    ) -> Tuple[Tuple[dict, float], ...]:
253        return (
254            (
255                dict(
256                    X=min(node.X + 1, self._size - 1)
257                    if action == RiverSwimAction.RIGHT
258                    else max(node.X - 1, 0),
259                ),
260                1.0,
261            ),
262        )
263
264    def _get_reward_distribution(
265        self, node: "NODE_TYPE", action: "ACTION_TYPE", next_node: "NODE_TYPE"
266    ) -> rv_continuous:
267        return (
268            self._optimal_distribution
269            if node.X == self._size - 1 and action == RiverSwimAction.RIGHT
270            else (
271                self._sub_optimal_distribution
272                if node.X == 0 and action == RiverSwimAction.LEFT
273                else self._other_distribution
274            )
275        )
276
277    def _get_starting_node_sampler(self) -> NextStateSampler:
278        return NextStateSampler(next_nodes=self._possible_starting_nodes)
279
280    def _check_parameters_in_input(self):
281        super(RiverSwimMDP, self)._check_parameters_in_input()
282
283        assert self._size > 1
284        assert self._optimal_mean_reward - 0.1 > self._sub_optimal_mean_reward
285
286        dists = [
287            self._sub_optimal_distribution,
288            self._optimal_distribution,
289            self._other_distribution,
290        ]
291        check_distributions(
292            dists,
293            self._make_reward_stochastic,
294        )
295
296    def _get_grid_representation(self, node: "NODE_TYPE") -> np.ndarray:
297        grid = np.zeros((1, self._size), dtype=str)
298        grid[:, :] = " "
299        grid[0, 0] = "S"
300        grid[0, -1] = "G"
301        grid[0, node.X] = "A"
302        return grid
303
304    @property
305    def _possible_starting_nodes(self) -> List["NODE_TYPE"]:
306        return [RiverSwimNode(0)]
307
308    @property
309    def parameters(self) -> Dict[str, Any]:
310        return {
311            **super(RiverSwimMDP, self).parameters,
312            **dict(
313                size=self._size,
314                optimal_mean_reward=self._optimal_mean_reward,
315                sub_optimal_mean_reward=self._sub_optimal_mean_reward,
316                optimal_distribution=self._optimal_distribution,
317                sub_optimal_distribution=self._sub_optimal_distribution,
318                other_distribution=self._other_distribution,
319            ),
320        }
@dataclass(frozen=True)
class RiverSwimNode:

The node for the RiverSwim MDP.

RiverSwimNode(X: int)
X: int

x coordinate.
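
A minimal usage sketch of the node; the coordinate value is arbitrary and chosen only for illustration.

from colosseum.mdp.river_swim.base import RiverSwimNode

node = RiverSwimNode(X=3)   # arbitrary position along the chain
print(node)                 # X=3
print(tuple(node))          # (3, 3): __iter__ yields the x coordinate twice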

class RiverSwimAction(enum.IntEnum):

The actions available in the RiverSwim MDP.

class RiverSwimMDP(colosseum.mdp.base.BaseMDP, abc.ABC):

The base class for the RiverSwim family.

RiverSwimMDP( seed: int, size: int, optimal_mean_reward: float = 0.9, sub_optimal_mean_reward: float = 0.2, sub_optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, other_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, make_reward_stochastic=False, reward_variance_multiplier: float = 1.0, **kwargs)
Parameters
  • seed (int): The seed used for sampling rewards and next states.
  • size (int): The length of the chain.
  • optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the highly rewarding states. By default, it is set to 0.9.
  • sub_optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the suboptimal states. By default, it is set to 0.2.
  • sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
  • reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
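
A hedged construction sketch. RiverSwimEpisodic is assumed to be a concrete subclass exported by colosseum.mdp (it is not defined in this module), and the seed and size values are arbitrary.

# RiverSwimEpisodic is an assumed concrete subclass; only parameters documented
# above are used here.
from colosseum.mdp import RiverSwimEpisodic

mdp = RiverSwimEpisodic(
    seed=42,
    size=10,
    make_reward_stochastic=True,
    # Reward distributions can also be passed explicitly as ("beta", (a, b)) or
    # ("deterministic", (value,)) tuples, matching the format produced by
    # sample_mdp_parameters.
)
print(mdp.n_actions)  # 2 (LEFT and RIGHT)
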
@staticmethod
def get_action_class() -> colosseum.mdp.river_swim.base.RiverSwimAction:
@staticmethod
def get_unique_symbols() -> List[str]:
Returns
  • List[str]: The unique symbols used in the grid representation of the MDP.
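
These symbols are the possible entries of the grid produced by _get_grid_representation. The following standalone sketch mirrors that logic for an assumed chain of length 5 with the agent at X=2; both values are chosen only for illustration.

import numpy as np

# Mirrors _get_grid_representation for size=5 and agent position X=2.
size, agent_x = 5, 2
grid = np.zeros((1, size), dtype=str)
grid[:, :] = " "        # empty cells
grid[0, 0] = "S"        # starting state at the left end of the chain
grid[0, -1] = "G"       # highly rewarding state at the right end
grid[0, agent_x] = "A"  # current agent position
print(grid)             # [['S' ' ' 'A' ' ' 'G']]
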
@staticmethod
def does_seed_change_MDP_structure() -> bool:
Returns
  • bool: True if changing the seed changes the transition matrix and/or the reward matrix. This may happen, for example, when there are fewer effective starting states than possible ones and the starting states are picked at random based on the seed.
@staticmethod
def sample_mdp_parameters(n: int, is_episodic: bool, seed: int = None) -> List[Dict[str, Any]]:
Returns
  • List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
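
Since sample_mdp_parameters is a static method, it can be called without instantiating a concrete subclass. A minimal sketch with arbitrary arguments, assuming the sampled dictionaries keep the keys set in the source above:

from colosseum.mdp.river_swim.base import RiverSwimMDP

# Draw three parameter dictionaries for episodic RiverSwim instances.
params = RiverSwimMDP.sample_mdp_parameters(n=3, is_episodic=True, seed=0)
for p in params:
    print(p["size"], p["make_reward_stochastic"], p["sub_optimal_distribution"])
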
@staticmethod
def get_node_class() -> Type[colosseum.mdp.river_swim.base.RiverSwimNode]:
Returns
  • Type["NODE_TYPE"]: The class of the nodes of the MDP.
def get_gin_parameters(self, index: int) -> str:
Returns
  • str: The gin config of the MDP instance.
n_actions: int
Returns
  • int: The number of available actions.
parameters: Dict[str, Any]
Returns
  • Dict[str, Any]: The parameters of the MDP.
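
A short sketch of reading these properties from an instance; as above, RiverSwimEpisodic is an assumed concrete subclass and the constructor arguments are arbitrary.

from colosseum.mdp import RiverSwimEpisodic  # assumed concrete subclass

mdp = RiverSwimEpisodic(seed=0, size=8)
print(mdp.n_actions)              # 2, i.e. len(RiverSwimAction)
for name, value in mdp.parameters.items():
    print(f"{name}: {value}")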