colosseum.mdp.deep_sea.base
```python
import abc
from dataclasses import dataclass
from enum import IntEnum
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union

import numpy as np
from scipy.stats import beta, rv_continuous

from colosseum.mdp import BaseMDP
from colosseum.mdp.utils.custom_samplers import NextStateSampler
from colosseum.utils.miscellanea import (
    check_distributions,
    deterministic,
    get_dist,
    rounding_nested_structure,
)

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE, NODE_TYPE


@dataclass(frozen=True)
class DeepSeaNode:
    """
    The node for the DeepSea MDP.
    """

    X: int
    """x coordinate."""
    Y: int
    """y coordinate."""

    def __str__(self):
        return f"X={self.X},Y={self.Y}"

    def __iter__(self):
        return iter((self.X, self.Y))


class DeepSeaAction(IntEnum):
    """
    The actions available in the DeepSea MDP.
    """

    LEFT = 0
    """Move towards the left."""
    RIGHT = 1
    """Move towards the right."""


class DeepSeaMDP(BaseMDP, abc.ABC):
    """
    The base class for the DeepSea family.
    """

    @staticmethod
    def get_unique_symbols() -> List[str]:
        return ["A", " "]

    @staticmethod
    def does_seed_change_MDP_structure() -> bool:
        return False

    @staticmethod
    def sample_mdp_parameters(
        n: int, is_episodic: bool, seed: int = None
    ) -> List[Dict[str, Any]]:
        rng = np.random.RandomState(np.random.randint(10_000) if seed is None else seed)
        samples = []
        for _ in range(n):
            sample = dict(
                size=int(
                    (1 + np.minimum((800 / (100 * rng.random() + 35)), 25))
                    * (0.8 if is_episodic else 1)
                ),
                p_rand=min(2 / (8 * rng.random() + 3), 0.95),
                make_reward_stochastic=rng.choice([True, False]),
                reward_variance_multiplier=2 * rng.random() + 0.005,
            )
            sample["p_rand"] = None if sample["p_rand"] < 0.01 else sample["p_rand"]

            if sample["make_reward_stochastic"]:
                sample["sub_optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"]
                        * (sample["size"] / 0.5 - 1),
                    ),
                )
                sample["optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"] * (sample["size"] / 1 - 1),
                        sample["reward_variance_multiplier"],
                    ),
                )
                sample["other_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"]
                        * 10
                        * (sample["size"] / 0.5 - 1),
                    ),
                )
            else:
                sample["sub_optimal_distribution"] = (
                    "deterministic",
                    (1.0 / (sample["size"] ** 2),),
                )
                sample["optimal_distribution"] = ("deterministic", (1.0,))
                sample["other_distribution"] = ("deterministic", (0.0,))

            samples.append(rounding_nested_structure(sample))

        return samples

    @staticmethod
    def get_node_class() -> Type[DeepSeaNode]:
        return DeepSeaNode

    def get_gin_parameters(self, index: int) -> str:
        prms = dict(
            size=self._size,
            make_reward_stochastic=self._make_reward_stochastic,
            reward_variance_multiplier=self._reward_variance_multiplier,
            sub_optimal_distribution=(
                self._sub_optimal_distribution.dist.name,
                self._sub_optimal_distribution.args,
            ),
            optimal_distribution=(
                self._optimal_distribution.dist.name,
                self._optimal_distribution.args,
            ),
            other_distribution=(
                self._other_distribution.dist.name,
                self._other_distribution.args,
            ),
        )
        if self._p_rand is not None:
            prms["p_rand"] = self._p_rand

        return DeepSeaMDP.produce_gin_file_from_mdp_parameters(
            prms, type(self).__name__, index
        )

    @property
    def n_actions(self) -> int:
        return len(DeepSeaAction)

    def __init__(
        self,
        seed: int,
        size: int,
        optimal_return: float = 1.0,
        suboptimal_return: float = 0.5,
        optimal_distribution: Union[Tuple, rv_continuous] = None,
        sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
        other_distribution: Union[Tuple, rv_continuous] = None,
        make_reward_stochastic=False,
        reward_variance_multiplier: float = 1.0,
        **kwargs,
    ):
        """
        Parameters
        ----------
        seed : int
            The seed used for sampling rewards and next states.
        size : int
            The size of the grid.
        optimal_return : float
            If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
            By default, it is set to 1.
        suboptimal_return: float
            If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
            By default, it is set to 0.5.
        optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
            or as a rv_continuous object.
        sub_optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
            parameters or as a rv_continuous object.
        other_distribution : Union[Tuple, rv_continuous]
            The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
            rv_continuous object.
        make_reward_stochastic : bool
            If True, the rewards of the MDP will be stochastic. By default, it is set to False.
        reward_variance_multiplier : float
            A constant that can be used to increase the variance of the reward distributions without changing their means.
            The lower the value, the higher the variance. By default, it is set to 1.
        """

        if type(sub_optimal_distribution) == tuple:
            sub_optimal_distribution = get_dist(
                sub_optimal_distribution[0], sub_optimal_distribution[1]
            )
        if type(optimal_distribution) == tuple:
            optimal_distribution = get_dist(
                optimal_distribution[0], optimal_distribution[1]
            )
        if type(other_distribution) == tuple:
            other_distribution = get_dist(other_distribution[0], other_distribution[1])

        self._size = size
        self._optimal_return = optimal_return
        self._suboptimal_return = suboptimal_return
        self._optimal_distribution = optimal_distribution
        self._sub_optimal_distribution = sub_optimal_distribution
        self._other_distribution = other_distribution

        dists = [
            sub_optimal_distribution,
            optimal_distribution,
            other_distribution,
        ]
        if dists.count(None) == 0:
            self._sub_optimal_distribution = sub_optimal_distribution
            self._optimal_distribution = optimal_distribution
            self._other_distribution = other_distribution
        else:
            if make_reward_stochastic:
                self._sub_optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (size / self._suboptimal_return - 1),
                )
                self._optimal_distribution = beta(
                    reward_variance_multiplier * (size / self._optimal_return - 1),
                    reward_variance_multiplier,
                )
                self._other_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier
                    * 10
                    * (size / self._suboptimal_return - 1),
                )
            else:
                self._sub_optimal_distribution = deterministic(1.0 / (size ** 2))
                self._optimal_distribution = deterministic(1.0)
                self._other_distribution = deterministic(0.0)

        super(DeepSeaMDP, self).__init__(
            seed=seed,
            reward_variance_multiplier=reward_variance_multiplier,
            make_reward_stochastic=make_reward_stochastic,
            **kwargs,
        )

    @property
    def _possible_starting_nodes(self) -> List[DeepSeaNode]:
        return [DeepSeaNode(0, self._size - 1)]

    def _get_next_nodes_parameters(
        self, node: "NODE_TYPE", action: "ACTION_TYPE"
    ) -> Tuple[Tuple[dict, float], ...]:
        if node.Y == 0:
            return ((dict(X=0, Y=self._size - 1), 1.0),)

        return (
            (
                dict(
                    X=min(node.X + 1, self._size - 1)
                    if action == DeepSeaAction.RIGHT
                    else max(node.X - 1, 0),
                    Y=max(0, node.Y - 1),
                ),
                1.0,
            ),
        )

    def _get_reward_distribution(
        self, node: "NODE_TYPE", action: "ACTION_TYPE", next_node: "NODE_TYPE"
    ) -> rv_continuous:
        return (
            self._optimal_distribution
            if node.X == self._size - 1
            and node.Y == 0
            and action == DeepSeaAction.RIGHT
            else (
                self._sub_optimal_distribution
                if action == DeepSeaAction.LEFT
                else self._other_distribution
            )
        )

    def _get_starting_node_sampler(self) -> NextStateSampler:
        return NextStateSampler(next_nodes=self._possible_starting_nodes)

    def _check_parameters_in_input(self):
        super(DeepSeaMDP, self)._check_parameters_in_input()

        assert self._size > 1

        # No lazy mechanic for DeepSea
        assert self._p_lazy is None

        assert self._suboptimal_return < self._optimal_return - 0.1

        dists = [
            self._sub_optimal_distribution,
            self._optimal_distribution,
            self._other_distribution,
        ]
        check_distributions(
            dists,
            self._make_reward_stochastic,
        )

    def _get_grid_representation(self, node: "NODE_TYPE") -> np.ndarray:
        grid = np.zeros((self._size, self._size), dtype=str)
        grid[:, :] = " "
        grid[node.Y, node.X] = "A"
        return grid[::-1, :]

    @property
    def parameters(self) -> Dict[str, Any]:
        return {
            **super(DeepSeaMDP, self).parameters,
            **dict(
                size=self._size,
                optimal_return=self._optimal_return,
                suboptimal_return=self._suboptimal_return,
                optimal_distribution=self._optimal_distribution,
                sub_optimal_distribution=self._sub_optimal_distribution,
                other_distribution=self._other_distribution,
            ),
        }
```
@dataclass(frozen=True)
class DeepSeaNode:
The node for the DeepSea MDP.
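Based on the definition above, a minimal usage sketch of the node type:

```python
from colosseum.mdp.deep_sea.base import DeepSeaNode

# A node is an immutable (X, Y) coordinate pair.
node = DeepSeaNode(X=0, Y=4)

print(node)   # "X=0,Y=4", from __str__
x, y = node   # __iter__ allows tuple-style unpacking
assert (x, y) == (0, 4)

# frozen=True makes the dataclass hashable, so nodes can be used as dict keys.
visit_counts = {node: 1}
```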
class DeepSeaAction(enum.IntEnum):
The actions available in the DeepSea MDP.
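A short sketch of how the action enum behaves as an IntEnum:

```python
from colosseum.mdp.deep_sea.base import DeepSeaAction

# IntEnum members compare equal to their integer values.
assert DeepSeaAction.LEFT == 0
assert DeepSeaAction.RIGHT == 1

# The MDP's n_actions property is simply the number of enum members.
assert len(DeepSeaAction) == 2
```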
Inherited Members
- enum.Enum
- name
- value
- builtins.int
- conjugate
- bit_length
- to_bytes
- from_bytes
- as_integer_ratio
- real
- imag
- numerator
- denominator
class DeepSeaMDP(BaseMDP, abc.ABC):
The base class for the DeepSea family.
DeepSeaMDP(seed: int, size: int, optimal_return: float = 1.0, suboptimal_return: float = 0.5, optimal_distribution: Union[Tuple, rv_continuous] = None, sub_optimal_distribution: Union[Tuple, rv_continuous] = None, other_distribution: Union[Tuple, rv_continuous] = None, make_reward_stochastic=False, reward_variance_multiplier: float = 1.0, **kwargs)
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The size of the grid.
- optimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 1.
- suboptimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.5.
- optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that controls the variance of the reward distributions without changing their means: the lower the value, the higher the variance. By default, it is set to 1.
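DeepSeaMDP is abstract, so instances are created through a concrete DeepSea variant. The following construction sketch assumes an episodic subclass named DeepSeaEpisodic importable from colosseum.mdp.deep_sea; the class name and import path are assumptions, not guaranteed by this module.

```python
from colosseum.mdp.deep_sea import DeepSeaEpisodic  # assumed concrete subclass

# Deterministic rewards (the default): 1 for the treasure transition,
# 1 / size**2 whenever LEFT is taken, and 0 otherwise.
mdp = DeepSeaEpisodic(seed=42, size=8)

# Stochastic rewards: either let the class derive Beta distributions from
# optimal_return, suboptimal_return and reward_variance_multiplier ...
mdp_stochastic = DeepSeaEpisodic(seed=42, size=8, make_reward_stochastic=True)

# ... or pass all three distributions explicitly, e.g. as ("beta", (a, b))
# tuples, which the constructor resolves through get_dist. The explicit
# distributions are only used when all three are provided.
mdp_custom = DeepSeaEpisodic(
    seed=42,
    size=8,
    make_reward_stochastic=True,
    optimal_distribution=("beta", (7.0, 1.0)),
    sub_optimal_distribution=("beta", (1.0, 15.0)),
    other_distribution=("beta", (1.0, 150.0)),
)
```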
@staticmethod
def get_unique_symbols() -> List[str]:
Returns
- List[str]: the unique symbols of the grid representation of the MDP.
@staticmethod
def does_seed_change_MDP_structure() -> bool:
Returns
- bool: True if changing the seed changes the transition matrix and/or the rewards matrix. This may happen, for example, when there are fewer starting states than possible ones and the effective starting states are picked at random based on the seed.
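For the DeepSea family this always returns False: changing the seed changes the reward samples (and the random transitions, if p_rand is set), but not the layout of the grid. A sketch of what that implies, reusing the assumed DeepSeaEpisodic subclass from above and the inherited T (transition probabilities) property:

```python
import numpy as np

mdp_a = DeepSeaEpisodic(seed=0, size=6)
mdp_b = DeepSeaEpisodic(seed=1, size=6)

assert DeepSeaEpisodic.does_seed_change_MDP_structure() is False

# With identical structural parameters, the transition kernels coincide
# regardless of the seed.
assert np.allclose(mdp_a.T, mdp_b.T)
```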
@staticmethod
def sample_mdp_parameters(n: int, is_episodic: bool, seed: int = None) -> List[Dict[str, Any]]:
Returns
- List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
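Being a static method, it can be called on the abstract class directly; each returned dictionary is a set of keyword arguments for a concrete DeepSea constructor. A short sketch (DeepSeaEpisodic is again an assumed subclass name):

```python
from colosseum.mdp.deep_sea.base import DeepSeaMDP

params = DeepSeaMDP.sample_mdp_parameters(n=3, is_episodic=True, seed=0)

# Each sample contains size, p_rand, make_reward_stochastic,
# reward_variance_multiplier, and the three reward distributions.
print(params[0])

# The dictionaries can be unpacked into a concrete DeepSea constructor.
mdp = DeepSeaEpisodic(seed=0, **params[0])
```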
def get_gin_parameters(self, index: int) -> str:
Returns
- str: The gin config of the MDP instance.
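A brief sketch of retrieving the gin configuration for an instance, reusing the mdp object from the sketches above; the exact formatting of the returned string is delegated to the inherited produce_gin_file_from_mdp_parameters:

```python
# index distinguishes multiple parameter configurations of the same
# MDP class within one gin file.
gin_config = mdp.get_gin_parameters(index=0)
print(gin_config)
```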
Inherited Members
- colosseum.mdp.base.BaseMDP
- get_available_hardness_measures
- produce_gin_file_from_mdp_parameters
- is_episodic
- sample_parameters
- get_grid_representation
- get_gin_config
- get_node_labels
- get_node_action_labels
- hash
- instantiate_MDP
- T
- R
- recurrent_nodes_set
- communication_class
- get_optimal_policy
- get_worst_policy
- get_value_functions
- optimal_value_functions
- worst_value_functions
- random_value_functions
- optimal_transition_probabilities
- worst_transition_probabilities
- random_transition_probabilities
- optimal_markov_chain
- worst_markov_chain
- random_markov_chain
- get_stationary_distribution
- optimal_stationary_distribution
- worst_stationary_distribution
- random_stationary_distribution
- optimal_average_rewards
- worst_average_rewards
- random_average_rewards
- get_average_reward
- optimal_average_reward
- worst_average_reward
- random_average_reward
- transition_matrix_and_rewards
- graph_layout
- graph_metrics
- diameter
- sum_reciprocals_suboptimality_gaps
- discounted_value_norm
- undiscounted_value_norm
- value_norm
- measures_of_hardness
- summary
- hardness_report
- get_info_class
- get_transition_distributions
- get_reward_distribution
- sample_reward
- get_measure_from_name
- action_spec
- observation_spec
- get_observation
- reset
- step
- random_steps
- random_step
- get_visitation_counts
- reset_visitation_counts
- get_value_node_labels
- dm_env._environment.Environment
- reward_spec
- discount_spec
- close