colosseum.mdp.frozen_lake.base

import abc
from dataclasses import dataclass
from enum import IntEnum
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union

import numpy as np
from gym.envs.toy_text.frozen_lake import generate_random_map
from scipy.stats import beta, rv_continuous

from colosseum.mdp import BaseMDP
from colosseum.mdp.utils.custom_samplers import NextStateSampler
from colosseum.utils.miscellanea import (
    check_distributions,
    deterministic,
    get_dist,
    rounding_nested_structure,
)

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE, NODE_TYPE

@dataclass(frozen=True)
class FrozenLakeNode:
    """
    The node for the FrozenLake MDP.
    """

    X: int
    """x coordinate."""
    Y: int
    """y coordinate."""

    def __str__(self):
        return f"X={self.X},Y={self.Y}"

    def __iter__(self):
        return iter((self.X, self.Y))

class FrozenLakeAction(IntEnum):
    """The actions available in the FrozenLake MDP."""

    UP = 0
    """Move up."""
    RIGHT = 1
    """Move towards the right."""
    DOWN = 2
    """Move down."""
    LEFT = 3
    """Move towards the left."""

class FrozenLakeMDP(BaseMDP, abc.ABC):
    """
    The base class for the FrozenLake family.
    """

    @staticmethod
    def get_unique_symbols() -> List[str]:
        # "A": agent, "F": frozen tile, "H": hole, "G": goal.
        return ["A", "F", "H", "G"]

    @staticmethod
    def does_seed_change_MDP_structure() -> bool:
        # The map layout is generated from the seed, so different seeds
        # produce structurally different MDPs.
        return True

    @staticmethod
    def sample_mdp_parameters(
        n: int, is_episodic: bool, seed: int = None
    ) -> List[Dict[str, Any]]:
        rng = np.random.RandomState(np.random.randint(10_000) if seed is None else seed)
        samples = []
        for _ in range(n):
            # A Dirichlet sample ensures that p_rand + p_lazy <= 0.9.
            p_rand, p_lazy, _ = 0.9 * rng.dirichlet([0.2, 0.2, 5])
            sample = dict(
                # Episodic MDPs use small grids (5 or 6 tiles per side);
                # continuous MDPs derive the size from a random draw, yielding
                # roughly 4 to 13 tiles per side, skewed towards smaller grids.
                size=rng.choice(range(5, 7), None, True, [0.665, 0.335])
                if is_episodic
                else int((2.5 + np.minimum((400 / (150 * rng.random() + 35)), 15))),
                p_frozen=min((0.55 * rng.random() + 0.45) ** 0.3, 0.95),
                p_rand=p_rand,
                p_lazy=p_lazy,
                make_reward_stochastic=rng.choice([True, False]),
                reward_variance_multiplier=2 * rng.random() + 0.005,
            )
            sample["p_rand"] = None if sample["p_rand"] < 0.01 else sample["p_rand"]
            sample["p_lazy"] = None if sample["p_lazy"] < 0.01 else sample["p_lazy"]

            if sample["make_reward_stochastic"]:
                # The Beta parameters give the default reward a mean of
                # 0.1 / size ** 2 and the goal reward a mean of
                # 1 - 1 / size ** 2, matching the default suboptimal_return
                # and optimal_return values of the constructor.
                sample["default_r"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"]
                        * (sample["size"] ** 2 / 0.1 - 1),
                    ),
                )
                sample["goal_r"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"]
                        * (sample["size"] ** 2 - 1),
                        sample["reward_variance_multiplier"],
                    ),
                )
            else:
                sample["default_r"] = ("deterministic", (0.0,))
                sample["goal_r"] = ("deterministic", (1.0,))

            samples.append(rounding_nested_structure(sample))
        return samples

    @staticmethod
    def get_node_class() -> Type["NODE_TYPE"]:
        return FrozenLakeNode

    def get_gin_parameters(self, index: int) -> str:
        prms = dict(
            size=self._size,
            p_frozen=self._p_frozen,
            make_reward_stochastic=self._make_reward_stochastic,
            reward_variance_multiplier=self._reward_variance_multiplier,
            default_r=(
                self._default_r.dist.name,
                self._default_r.args,
            ),
            goal_r=(
                self._goal_r.dist.name,
                self._goal_r.args,
            ),
        )

        if self._p_rand is not None:
            prms["p_rand"] = self._p_rand
        if self._p_lazy is not None:
            prms["p_lazy"] = self._p_lazy

        return FrozenLakeMDP.produce_gin_file_from_mdp_parameters(
            prms, type(self).__name__, index
        )

    @property
    def n_actions(self) -> int:
        return len(FrozenLakeAction)

    def _next_positions(self, x, y, a):
        # Reaching the goal teleports the agent back to the starting tile.
        if self.lake[x, y] == "G":
            return dict(X=0, Y=0)

        # The (x, y) coordinates follow the orientation of the internal lake
        # array, which is transposed and flipped for visualization in
        # _get_grid_representation.
        if a == FrozenLakeAction.LEFT:
            next_x, next_y = x, min(y + 1, self._size - 1)
        if a == FrozenLakeAction.DOWN:
            next_x, next_y = min(x + 1, self._size - 1), y
        if a == FrozenLakeAction.RIGHT:
            next_x, next_y = x, max(y - 1, 0)
        if a == FrozenLakeAction.UP:
            next_x, next_y = max(x - 1, 0), y
        next_pos = self.lake[next_x, next_y]
        # Falling into a hole also sends the agent back to the start.
        if next_pos == "H":
            return dict(X=0, Y=0)
        else:
            return dict(X=next_x, Y=next_y)

    def _get_next_nodes_parameters(
        self, node: "NODE_TYPE", action: "ACTION_TYPE"
    ) -> Tuple[Tuple[dict, float], ...]:
        # When slippery, the intended direction is taken with probability 0.5
        # and each of the two perpendicular directions with probability 0.25.
        p = 0.5 if self._is_slippery else 1.0
        next_nodes_prms = []
        next_nodes_prms.append((self._next_positions(node.X, node.Y, action), p))
        if self._is_slippery:
            for a in [(action - 1) % 4, (action + 1) % 4]:
                next_nodes_prms.append((self._next_positions(node.X, node.Y, a), p / 2))
        return next_nodes_prms

    def _get_reward_distribution(
        self, node: "NODE_TYPE", action: "ACTION_TYPE", next_node: "NODE_TYPE"
    ) -> rv_continuous:
        if self.lake[next_node.X, next_node.Y] == "G":
            return self._goal_r
        return self._default_r

    def _get_starting_node_sampler(self) -> NextStateSampler:
        return NextStateSampler(next_nodes=self._possible_starting_nodes)

    def _check_parameters_in_input(self):
        super(FrozenLakeMDP, self)._check_parameters_in_input()

        assert self._p_frozen >= 0.1
        assert self._size > 2

        assert self._suboptimal_return + 0.2 < self._optimal_return

        dists = [
            self._goal_r,
            self._default_r,
        ]
        check_distributions(
            dists,
            self._make_reward_stochastic,
        )

    def _get_grid_representation(self, node: "NODE_TYPE") -> np.ndarray:
        grid = self.lake.copy()
        grid[0, 0] = "F"
        grid[node.X, node.Y] = "A"
        return grid.T[::-1, :]

    @property
    def _possible_starting_nodes(self) -> List["NODE_TYPE"]:
        return [FrozenLakeNode(0, 0)]

    @property
    def parameters(self) -> Dict[str, Any]:
        return {
            **super(FrozenLakeMDP, self).parameters,
            **dict(
                size=self._size,
                p_frozen=self._p_frozen,
                optimal_return=self._optimal_return,
                suboptimal_return=self._suboptimal_return,
                is_slippery=self._is_slippery,
                goal_r=self._goal_r,
                default_r=self._default_r,
            ),
        }

    def __init__(
        self,
        seed: int,
        size: int,
        p_frozen: float,
        optimal_return: float = 1.0,
        suboptimal_return: float = 0.1,
        is_slippery: bool = True,
        goal_r: Union[Tuple, rv_continuous] = None,
        default_r: Union[Tuple, rv_continuous] = None,
        make_reward_stochastic=False,
        reward_variance_multiplier: float = 1.0,
        **kwargs,
    ):
        """
        Parameters
        ----------
        seed : int
            The seed used for sampling rewards and next states.
        size : int
            The size of the grid.
        p_frozen : float
            The probability that a tile of the lake is frozen and does not contain a hole.
        optimal_return : float
            If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
            By default, it is set to 1.
        suboptimal_return : float
            If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
            By default, it is set to 0.1.
        is_slippery : bool
            If True, the outcome of an action is stochastic because the frozen tiles are slippery. By default, it is
            set to True.
        goal_r : Union[Tuple, rv_continuous]
            The reward distribution of the highly rewarding (goal) state. It can be passed either as a tuple
            containing the Beta parameters or as an rv_continuous object.
        default_r : Union[Tuple, rv_continuous]
            The reward distribution of the other states. It can be passed either as a tuple containing the Beta
            parameters or as an rv_continuous object.
        make_reward_stochastic : bool
            If True, the rewards of the MDP are stochastic. By default, it is set to False.
        reward_variance_multiplier : float
            A constant that scales the parameters of the reward distributions without changing their means.
            The lower the value, the higher the variance. By default, it is set to 1.
        """

        if type(goal_r) == tuple:
            goal_r = get_dist(goal_r[0], goal_r[1])
        if type(default_r) == tuple:
            default_r = get_dist(default_r[0], default_r[1])

        self._size = size
        self._p_frozen = p_frozen
        self._optimal_return = optimal_return
        self._suboptimal_return = suboptimal_return
        self._is_slippery = is_slippery
        self._goal_r = goal_r
        self._default_r = default_r

        # The map layout is drawn from the global NumPy seed, so the placement
        # of the holes (and hence the structure of the MDP) depends on `seed`.
        np.random.seed(seed)
        self.lake = np.array(
            list(
                map(
                    lambda x: list(x),
                    generate_random_map(size=self._size, p=self._p_frozen),
                )
            )
        )

        if (default_r, goal_r).count(None) == 0:
            # Both reward distributions were explicitly provided.
            self._default_r = default_r
            self._goal_r = goal_r
        else:
            if make_reward_stochastic:
                # Beta distributions with means suboptimal_return / size ** 2
                # (default) and 1 - optimal_return / size ** 2 (goal).
                self._default_r = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier
                    * (size ** 2 / self._suboptimal_return - 1),
                )
                self._goal_r = beta(
                    reward_variance_multiplier * (size ** 2 / self._optimal_return - 1),
                    reward_variance_multiplier,
                )
            else:
                self._default_r = deterministic(0.0)
                self._goal_r = deterministic(1.0)

        super(FrozenLakeMDP, self).__init__(
            seed=seed,
            reward_variance_multiplier=reward_variance_multiplier,
            make_reward_stochastic=make_reward_stochastic,
            **kwargs,
        )

@dataclass(frozen=True)
class FrozenLakeNode:

The node for the FrozenLake MDP.

FrozenLakeNode(X: int, Y: int)
  • X (int): The x coordinate.
  • Y (int): The y coordinate.
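
A quick illustration of the node API, using only what is defined above:

node = FrozenLakeNode(2, 3)
x, y = node   # __iter__ allows tuple unpacking
print(node)   # prints "X=2,Y=3"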

class FrozenLakeAction(enum.IntEnum):

The actions available in the FrozenLake MDP.

  • UP = 0: Move up.
  • RIGHT = 1: Move towards the right.
  • DOWN = 2: Move down.
  • LEFT = 3: Move towards the left.
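
Since the members are plain integers, the slippery dynamics in _get_next_nodes_parameters can select the two directions perpendicular to the intended one with simple modular arithmetic; a minimal sketch:

a = FrozenLakeAction.DOWN                                  # value 2
perpendicular = [(a - 1) % 4, (a + 1) % 4]                 # [1, 3]
print([FrozenLakeAction(p).name for p in perpendicular])   # ['RIGHT', 'LEFT']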

class FrozenLakeMDP(colosseum.mdp.base.BaseMDP, abc.ABC):

The base class for the FrozenLake family.

FrozenLakeMDP(seed: int, size: int, p_frozen: float, optimal_return: float = 1.0, suboptimal_return: float = 0.1, is_slippery: bool = True, goal_r: Union[Tuple, rv_continuous] = None, default_r: Union[Tuple, rv_continuous] = None, make_reward_stochastic=False, reward_variance_multiplier: float = 1.0, **kwargs)
Parameters
  • seed (int): The seed used for sampling rewards and next states.
  • size (int): The size of the grid.
  • p_frozen (float): The probability that a tile of the lake is frozen and does not contain a hole.
  • optimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 1.
  • suboptimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.1.
  • is_slippery (bool): If True, the outcome of an action is stochastic because the frozen tiles are slippery. By default, it is set to True.
  • goal_r (Union[Tuple, rv_continuous]): The reward distribution of the highly rewarding (goal) state. It can be passed either as a tuple containing the Beta parameters or as an rv_continuous object.
  • default_r (Union[Tuple, rv_continuous]): The reward distribution of the other states. It can be passed either as a tuple containing the Beta parameters or as an rv_continuous object.
  • make_reward_stochastic (bool): If True, the rewards of the MDP are stochastic. By default, it is set to False.
  • reward_variance_multiplier (float): A constant that scales the parameters of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
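
Since FrozenLakeMDP is abstract, instances are created through a concrete member of the family; the import path FrozenLakeEpisodic below is an assumption for illustration, not confirmed by this module. The sketch also verifies the means implied by the Beta parameterization used in __init__: default_r has mean suboptimal_return / size ** 2 and goal_r has mean 1 - optimal_return / size ** 2.

from scipy.stats import beta

# Means implied by the Beta parameterization in __init__, with size=5,
# reward_variance_multiplier k=1 and the default returns (1.0 and 0.1).
size, k = 5, 1.0
default_r = beta(k, k * (size ** 2 / 0.1 - 1))
goal_r = beta(k * (size ** 2 / 1.0 - 1), k)
print(default_r.mean())  # 0.1 / 25 = 0.004
print(goal_r.mean())     # 1 - 1 / 25 = 0.96

# Constructing an instance through a concrete subclass (assumed import path).
from colosseum.mdp.frozen_lake import FrozenLakeEpisodic

mdp = FrozenLakeEpisodic(seed=42, size=5, p_frozen=0.9, make_reward_stochastic=True)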
@staticmethod
def get_unique_symbols() -> List[str]:
Returns
  • List[str]: The unique symbols of the grid representation of the MDP: "A" (the agent), "F" (a frozen tile), "H" (a hole), and "G" (the goal).
@staticmethod
def does_seed_change_MDP_structure() -> bool:
Returns
  • bool: True if changing the seed changes the transition and/or reward matrices. This may happen, for example, when there are fewer effective starting states than possible ones and the effective starting states are picked at random based on the seed. For the FrozenLake family, this is always the case, since the map layout itself is generated from the seed.
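
A minimal sketch of this property, again via the assumed FrozenLakeEpisodic subclass:

# Two seeds generally produce different lakes, i.e. different MDP structures.
m1 = FrozenLakeEpisodic(seed=0, size=5, p_frozen=0.9)
m2 = FrozenLakeEpisodic(seed=1, size=5, p_frozen=0.9)
print((m1.lake == m2.lake).all())  # typically False: the holes are re-sampled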
@staticmethod
def sample_mdp_parameters(n: int, is_episodic: bool, seed: int = None) -> List[Dict[str, Any]]:
Returns
  • List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
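
A minimal usage sketch; being a static method, it can be called on the abstract class directly:

params = FrozenLakeMDP.sample_mdp_parameters(n=3, is_episodic=True, seed=0)
for p in params:
    print(p["size"], p["p_frozen"], p["default_r"])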
def get_gin_parameters(self, index: int) -> str:
Returns
  • str: The gin config of the MDP instance.
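
A short usage sketch, with mdp an instance as constructed earlier; index only labels the produced gin config entry:

print(mdp.get_gin_parameters(index=0))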
n_actions: int
Returns
  • int: The number of available actions.
parameters: Dict[str, Any]
Returns
  • Dict[str, Any]: The parameters of the MDP.