colosseum.mdp.simple_grid.base

import abc
from dataclasses import dataclass
from enum import IntEnum
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union

import gin
import numpy as np
from scipy.stats import beta, rv_continuous

from colosseum.mdp import BaseMDP
from colosseum.mdp.utils.custom_samplers import NextStateSampler
from colosseum.utils.miscellanea import (
    check_distributions,
    deterministic,
    get_dist,
    rounding_nested_structure,
)

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE, NODE_TYPE


@dataclass(frozen=True)
class SimpleGridNode:
    """
    The node for the SimpleGrid MDP.
    """

    X: int
    """x coordinate."""
    Y: int
    """y coordinate."""

    def __str__(self):
        return f"X={self.X},Y={self.Y}"

    def __iter__(self):
        return iter((self.X, self.Y))


class SimpleGridAction(IntEnum):
    """
    The actions available in the SimpleGrid MDP.
    """

    UP = 0
    RIGHT = 1
    DOWN = 2
    LEFT = 3
    NO_OP = 4


@gin.constants_from_enum
class SimpleGridReward(IntEnum):
    """
    The reward types available in the SimpleGrid MDP. It controls the rewards for the corner states.
    """

    AND = 0
    NAND = 1
    OR = 2
    XOR = 3


class SimpleGridMDP(BaseMDP, abc.ABC):
    """
    The base class for the SimpleGrid family.
    """

    @staticmethod
    def get_action_class() -> SimpleGridAction:
        return SimpleGridAction

    @staticmethod
    def get_unique_symbols() -> List[str]:
        return [" ", "A", "+", "-"]

    @staticmethod
    def does_seed_change_MDP_structure() -> bool:
        return True

    @staticmethod
    def sample_mdp_parameters(
        n: int, is_episodic: bool, seed: int = None
    ) -> List[Dict[str, Any]]:
        rng = np.random.RandomState(np.random.randint(10_000) if seed is None else seed)
        samples = []
        for _ in range(n):
            p_rand, p_lazy, _ = 0.9 * rng.dirichlet([0.2, 0.2, 5])
            sample = dict(
                size=int(
                    (
                        1
                        + np.minimum((800 / (100 * rng.random() + 35)), 25)
                        * (0.8 if is_episodic else 1)
                    )
                ),
                n_starting_states=rng.randint(1, 5),
                p_rand=p_rand,
                p_lazy=p_lazy,
                make_reward_stochastic=rng.choice([True, False]),
                reward_variance_multiplier=2 * rng.random() + 0.005,
            )
            sample["p_rand"] = None if sample["p_rand"] < 0.01 else sample["p_rand"]
            sample["p_lazy"] = None if sample["p_lazy"] < 0.01 else sample["p_lazy"]

            sample["reward_type"] = rng.randint(4)

            if sample["make_reward_stochastic"]:
                sample["sub_optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"] * (10 / 0.2 - 1),
                    ),
                )
                sample["optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"] * (1 / 0.9 - 1),
                    ),
                )
                sample["other_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"] * (1 / 0.2 - 1),
                    ),
                )
            else:
                sample["sub_optimal_distribution"] = ("deterministic", (0.0,))
                sample["optimal_distribution"] = ("deterministic", (1.0,))
                sample["other_distribution"] = ("deterministic", (0.5,))

            samples.append(rounding_nested_structure(sample))
        return samples

    @staticmethod
    def get_node_class() -> Type["NODE_TYPE"]:
        return SimpleGridNode

    def get_gin_parameters(self, index: int) -> str:
        prms = dict(
            size=self._size,
            n_starting_states=self._n_starting_states,
            reward_type=int(self._reward_type),
            make_reward_stochastic=self._make_reward_stochastic,
            reward_variance_multiplier=self._reward_variance_multiplier,
            sub_optimal_distribution=(
                self._sub_optimal_distribution.dist.name,
                self._sub_optimal_distribution.args,
            ),
            optimal_distribution=(
                self._optimal_distribution.dist.name,
                self._optimal_distribution.args,
            ),
            other_distribution=(
                self._other_distribution.dist.name,
                self._other_distribution.args,
            ),
        )
        if self._p_rand is not None:
            prms["p_rand"] = self._p_rand

        return SimpleGridMDP.produce_gin_file_from_mdp_parameters(
            prms, type(self).__name__, index
        )

    @property
    def n_actions(self) -> int:
        return len(SimpleGridAction)

    def _get_next_nodes_parameters(
        self, node: "NODE_TYPE", action: "ACTION_TYPE"
    ) -> Tuple[Tuple[dict, float], ...]:
        if action == SimpleGridAction.UP:
            return ((dict(X=node.X, Y=min(node.Y + 1, self._size - 1)), 1.0),)
        if action == SimpleGridAction.RIGHT:
            return ((dict(X=min(node.X + 1, self._size - 1), Y=node.Y), 1.0),)
        if action == SimpleGridAction.DOWN:
            return ((dict(X=node.X, Y=max(node.Y - 1, 0)), 1.0),)
        if action == SimpleGridAction.LEFT:
            return ((dict(X=max(node.X - 1, 0), Y=node.Y), 1.0),)
        if action == SimpleGridAction.NO_OP:
            return ((dict(X=node.X, Y=node.Y), 1.0),)

    @staticmethod
    def _is_corner_loop(node, next_node, size):
        return (
            node.X == next_node.X
            and node.Y == next_node.Y
            and node.X in [0, size - 1]
            and node.Y in [0, size - 1]
        )

    def _get_reward_distribution(
        self, node: "NODE_TYPE", action: "ACTION_TYPE", next_node: "NODE_TYPE"
    ) -> rv_continuous:
        # Corner nodes
        if SimpleGridMDP._is_corner_loop(node, next_node, self._size):
            if (
                (self._reward_type == SimpleGridReward.AND and (node.X and node.Y))
                or (
                    self._reward_type == SimpleGridReward.NAND
                    and not (node.X and node.Y)
                )
                or (self._reward_type == SimpleGridReward.OR and (node.X | node.Y))
                or (self._reward_type == SimpleGridReward.XOR and (node.X ^ node.Y))
            ):
                return self._optimal_distribution
            else:
                return self._sub_optimal_distribution
        else:
            return self._other_distribution

    def _calculate_starting_nodes(self):
        center = np.array(((self._size - 1) / 2, (self._size - 1) / 2))
        distances = np.empty((self._size, self._size))
        for x in range(self._size):
            for y in range(self._size):
                distances[x, y] = ((np.array((x, y)) - center) ** 2).sum()

        batch: list = np.array(np.where(distances == distances.min())).T.tolist()
        self._rng.shuffle(batch)
        while not np.all(distances == np.inf):
            distances[batch[0][0], batch[0][1]] = np.inf
            yield batch[0]
            batch.pop(0)
            if len(batch) == 0:
                batch: list = np.array(
                    np.where(distances == distances.min())
                ).T.tolist()

    def _get_starting_node_sampler(self) -> NextStateSampler:
        starting_nodes_iter = self._calculate_starting_nodes()
        self.__possible_starting_nodes = [
            self.get_node_class()(*next(starting_nodes_iter))
            for _ in range((self._size - 1) ** 2)
        ]
        starting_nodes = self._possible_starting_nodes[: self._n_starting_states]
        self._rng.shuffle(starting_nodes)
        if len(starting_nodes) == 1:
            return NextStateSampler(next_nodes=starting_nodes)
        return NextStateSampler(
            next_nodes=starting_nodes,
            probs=[1 / self._n_starting_states for _ in range(self._n_starting_states)],
            seed=self._produce_random_seed(),
        )

    def _check_parameters_in_input(self):
        super(SimpleGridMDP, self)._check_parameters_in_input()

        assert self._n_starting_states <= (self._size - 1) ** 2
        assert self._optimal_mean_reward - 0.1 > self._sub_optimal_mean_reward

        dists = [
            self._sub_optimal_distribution,
            self._optimal_distribution,
            self._other_distribution,
        ]
        check_distributions(
            dists,
            self._make_reward_stochastic,
        )

    def _get_grid_representation(self, node: "NODE_TYPE") -> np.ndarray:
        grid = np.zeros((self._size, self._size), dtype=str)
        grid[:, :] = " "

        # Corner nodes
        if self._reward_type == SimpleGridReward.AND:
            grid[0, 0] = "-"
            grid[0, -1] = "-"
            grid[-1, 0] = "-"
            grid[-1, -1] = "+"
        elif self._reward_type == SimpleGridReward.NAND:
            grid[0, 0] = "+"
            grid[0, -1] = "+"
            grid[-1, 0] = "+"
            grid[-1, -1] = "-"
        elif self._reward_type == SimpleGridReward.OR:
            grid[0, 0] = "-"
            grid[0, -1] = "+"
            grid[-1, 0] = "+"
            grid[-1, -1] = "+"
        else:
            grid[0, 0] = "-"
            grid[0, -1] = "+"
            grid[-1, 0] = "+"
            grid[-1, -1] = "-"

        grid[node.Y, node.X] = "A"
        return grid[::-1, :]

    @property
    def _possible_starting_nodes(self) -> List["NODE_TYPE"]:
        return self.__possible_starting_nodes

    @property
    def parameters(self) -> Dict[str, Any]:
        return {
            **super(SimpleGridMDP, self).parameters,
            **dict(
                size=self._size,
                reward_type=self._reward_type,
                n_starting_states=self._n_starting_states,
                optimal_mean_reward=self._optimal_mean_reward,
                sub_optimal_mean_reward=self._sub_optimal_mean_reward,
                optimal_distribution=self._optimal_distribution,
                sub_optimal_distribution=self._sub_optimal_distribution,
                other_distribution=self._other_distribution,
            ),
        }

    def __init__(
        self,
        seed: int,
        size: int,
        reward_type: SimpleGridReward = SimpleGridReward.XOR,
        n_starting_states: int = 1,
        optimal_mean_reward: float = 0.9,
        sub_optimal_mean_reward: float = 0.2,
        optimal_distribution: Union[Tuple, rv_continuous] = None,
        sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
        other_distribution: Union[Tuple, rv_continuous] = None,
        make_reward_stochastic=False,
        reward_variance_multiplier: float = 1.0,
        **kwargs,
    ):
        """

        Parameters
        ----------
        seed : int
            The seed used for sampling rewards and next states.
        size : int
            The size of the grid.
        reward_type : SimpleGridReward
            The type of reward for the MDP. By default, the XOR type is used.
        n_starting_states : int
            The number of possible starting states.
        optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
            By default, it is set to 0.9.
        sub_optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
            By default, it is set to 0.2.
        optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
            or as a rv_continuous object.
        sub_optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
            parameters or as a rv_continuous object.
        other_distribution : Union[Tuple, rv_continuous]
            The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
            rv_continuous object.
        make_reward_stochastic : bool
            If True, the rewards of the MDP will be stochastic. By default, it is set to False.
        reward_variance_multiplier : float
            A constant that can be used to increase the variance of the reward distributions without changing their means.
            The lower the value, the higher the variance. By default, it is set to 1.
        """

        if type(sub_optimal_distribution) == tuple:
            sub_optimal_distribution = get_dist(
                sub_optimal_distribution[0], sub_optimal_distribution[1]
            )
        if type(optimal_distribution) == tuple:
            optimal_distribution = get_dist(
                optimal_distribution[0], optimal_distribution[1]
            )
        if type(other_distribution) == tuple:
            other_distribution = get_dist(other_distribution[0], other_distribution[1])

        self._size = size
        self._reward_type = SimpleGridReward(reward_type)
        self._n_starting_states = n_starting_states
        self._optimal_mean_reward = optimal_mean_reward
        self._sub_optimal_mean_reward = sub_optimal_mean_reward
        dists = [
            sub_optimal_distribution,
            optimal_distribution,
            other_distribution,
        ]

        if dists.count(None) == 0:
            self._sub_optimal_distribution = sub_optimal_distribution
            self._optimal_distribution = optimal_distribution
            self._other_distribution = other_distribution
        else:
            if make_reward_stochastic:
                self._sub_optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (10 / sub_optimal_mean_reward - 1),
                )
                self._optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / optimal_mean_reward - 1),
                )
                self._other_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / sub_optimal_mean_reward - 1),
                )
            else:
                self._sub_optimal_distribution = deterministic(0.0)
                self._optimal_distribution = deterministic(1.0)
                self._other_distribution = deterministic(0.5)

        super(SimpleGridMDP, self).__init__(
            seed=seed,
            reward_variance_multiplier=reward_variance_multiplier,
            make_reward_stochastic=make_reward_stochastic,
            **kwargs,
        )
@dataclass(frozen=True)
class SimpleGridNode:

The node for the SimpleGrid MDP.

SimpleGridNode(X: int, Y: int)
  • X (int): x coordinate.
  • Y (int): y coordinate.
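
A node is simply an immutable (X, Y) pair. A small usage sketch, grounded in the __str__ and __iter__ methods shown in the source above:

    from colosseum.mdp.simple_grid.base import SimpleGridNode

    node = SimpleGridNode(X=2, Y=3)
    x, y = node     # __iter__ yields (X, Y)
    print(node)     # prints "X=2,Y=3"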

class SimpleGridAction(enum.IntEnum):

The actions available in the SimpleGrid MDP.

@gin.constants_from_enum
class SimpleGridReward(enum.IntEnum):

The reward types available in the SimpleGrid MDP. It controls the rewards for the corner states.

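Concretely, a corner self-loop draws from the optimal reward distribution only when the corner's coordinates satisfy the chosen boolean rule (see _get_reward_distribution in the source above, where non-zero coordinates act as True). A small sketch of that logic, assuming a 5x5 grid purely for illustration:

    # Which corner states are highly rewarding under each reward type.
    # A corner (X, Y) has X, Y in {0, size - 1}.
    size = 5
    corners = [(0, 0), (0, size - 1), (size - 1, 0), (size - 1, size - 1)]
    rules = {
        "AND": lambda x, y: bool(x and y),
        "NAND": lambda x, y: not (x and y),
        "OR": lambda x, y: bool(x | y),
        "XOR": lambda x, y: bool(x ^ y),
    }
    for name, rule in rules.items():
        print(name, [c for c in corners if rule(*c)])
    # AND  -> only (4, 4)
    # NAND -> every corner except (4, 4)
    # OR   -> every corner except (0, 0)
    # XOR  -> only (0, 4) and (4, 0)
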
class SimpleGridMDP(colosseum.mdp.base.BaseMDP, abc.ABC):

The base class for the SimpleGrid family.

SimpleGridMDP( seed: int, size: int, reward_type: colosseum.mdp.simple_grid.base.SimpleGridReward = <SimpleGridReward.XOR: 3>, n_starting_states: int = 1, optimal_mean_reward: float = 0.9, sub_optimal_mean_reward: float = 0.2, optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, sub_optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, other_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, make_reward_stochastic=False, reward_variance_multiplier: float = 1.0, **kwargs)
Parameters
  • seed (int): The seed used for sampling rewards and next states.
  • size (int): The size of the grid.
  • reward_type (SimpleGridReward): The type of reward for the MDP. By default, the XOR type is used.
  • n_starting_states (int): The number of possible starting states.
  • optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 0.9.
  • sub_optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.2.
  • optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
  • reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
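
As a construction sketch: SimpleGridMDP itself is abstract, so an instance is created through one of its concrete subclasses. The subclass name and import below (SimpleGridEpisodic from colosseum.mdp.simple_grid) are assumptions to be checked against the installed version; the keyword arguments are the ones documented above.

    from colosseum.mdp.simple_grid import SimpleGridEpisodic  # assumed concrete subclass
    from colosseum.mdp.simple_grid.base import SimpleGridReward

    mdp = SimpleGridEpisodic(
        seed=42,
        size=5,
        reward_type=SimpleGridReward.XOR,
        n_starting_states=2,
        make_reward_stochastic=True,
        reward_variance_multiplier=1.5,
    )
    print(mdp.n_actions)            # 5, one per SimpleGridAction
    print(mdp.parameters["size"])   # 5
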
@staticmethod
def get_action_class() -> colosseum.mdp.simple_grid.base.SimpleGridAction:
@staticmethod
def get_unique_symbols() -> List[str]:
Returns
  • List[str]: the unique symbols of the grid representation of the MDP.
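
As an illustration of these symbols, the grid representation of a size-4 instance with XOR rewards and the agent at node (X=1, Y=1) would look as follows (rows listed with Y decreasing from top, mirroring the vertical flip in _get_grid_representation; this is a worked example, not library output):

    [["+", " ", " ", "-"],
     [" ", " ", " ", " "],
     [" ", "A", " ", " "],
     ["-", " ", " ", "+"]]
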
@staticmethod
def does_seed_change_MDP_structure() -> bool:
Returns
  • bool: True if changing the seed changes the transition matrix and/or the rewards matrix. This may happen, for example, when there are fewer starting states than possible ones and the effective starting states are picked randomly based on the seed.
@staticmethod
def sample_mdp_parameters(n: int, is_episodic: bool, seed: int = None) -> List[Dict[str, Any]]:
Returns
  • List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
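
A short sketch of how the sampled dictionaries are typically consumed; SimpleGridEpisodic is the assumed concrete subclass from the construction sketch above, and it is assumed that keys such as p_rand and p_lazy are forwarded to the base class through **kwargs:

    # Sample three parameter sets and build an MDP from the first one.
    params_list = SimpleGridMDP.sample_mdp_parameters(n=3, is_episodic=True, seed=0)
    mdp = SimpleGridEpisodic(seed=0, **params_list[0])
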
def get_gin_parameters(self, index: int) -> str:
Returns
  • str: The gin config of the MDP instance.
n_actions: int
Returns
  • int: The number of available actions.
parameters: Dict[str, Any]
Returns
  • Dict[str, Any]: The parameters of the MDP.