colosseum.mdp.frozen_lake.base
import abc
from dataclasses import dataclass
from enum import IntEnum
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union

import numpy as np
from gym.envs.toy_text.frozen_lake import generate_random_map
from scipy.stats import beta, rv_continuous

from colosseum.mdp import BaseMDP
from colosseum.mdp.utils.custom_samplers import NextStateSampler
from colosseum.utils.miscellanea import (
    check_distributions,
    deterministic,
    get_dist,
    rounding_nested_structure,
)

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE, NODE_TYPE


@dataclass(frozen=True)
class FrozenLakeNode:
    """
    The node for the FrozenLake MDP.
    """

    X: int
    """x coordinate."""
    Y: int
    """y coordinate."""

    def __str__(self):
        return f"X={self.X},Y={self.Y}"

    def __iter__(self):
        return iter((self.X, self.Y))


class FrozenLakeAction(IntEnum):
    """The action available in the FrozenLake MDP."""

    UP = 0
    """Move up."""
    RIGHT = 1
    """Move towards the right."""
    DOWN = 2
    """Move down."""
    LEFT = 3
    """Move towards the left."""


class FrozenLakeMDP(BaseMDP, abc.ABC):
    """
    The base class for the FrozenLake family.
    """

    @staticmethod
    def get_unique_symbols() -> List[str]:
        return ["A", "F", "H", "G"]

    @staticmethod
    def does_seed_change_MDP_structure() -> bool:
        return True

    @staticmethod
    def sample_mdp_parameters(
        n: int, is_episodic: bool, seed: int = None
    ) -> List[Dict[str, Any]]:
        rng = np.random.RandomState(np.random.randint(10_000) if seed is None else seed)
        samples = []
        for _ in range(n):
            p_rand, p_lazy, _ = 0.9 * rng.dirichlet([0.2, 0.2, 5])
            sample = dict(
                size=rng.choice(range(5, 7), None, True, [0.665, 0.335])
                if is_episodic
                else int((2.5 + np.minimum((400 / (150 * rng.random() + 35)), 15))),
                p_frozen=min((0.55 * rng.random() + 0.45) ** 0.3, 0.95),
                p_rand=p_rand,
                p_lazy=p_lazy,
                make_reward_stochastic=rng.choice([True, False]),
                reward_variance_multiplier=2 * rng.random() + 0.005,
            )
            sample["p_rand"] = None if sample["p_rand"] < 0.01 else sample["p_rand"]
            sample["p_lazy"] = None if sample["p_lazy"] < 0.01 else sample["p_lazy"]

            if sample["make_reward_stochastic"]:
                sample["default_r"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"]
                        * (sample["size"] ** 2 / 0.1 - 1),
                    ),
                )
                sample["goal_r"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"]
                        * (sample["size"] ** 2 - 1),
                        sample["reward_variance_multiplier"],
                    ),
                )
            else:
                sample["default_r"] = ("deterministic", (0.0,))
                sample["goal_r"] = ("deterministic", (1.0,))

            samples.append(rounding_nested_structure(sample))
        return samples

    @staticmethod
    def get_node_class() -> Type["NODE_TYPE"]:
        return FrozenLakeNode

    def get_gin_parameters(self, index: int) -> str:
        prms = dict(
            size=self._size,
            p_frozen=self._p_frozen,
            make_reward_stochastic=self._make_reward_stochastic,
            reward_variance_multiplier=self._reward_variance_multiplier,
            default_r=(
                self._default_r.dist.name,
                self._default_r.args,
            ),
            goal_r=(
                self._goal_r.dist.name,
                self._goal_r.args,
            ),
        )

        if self._p_rand is not None:
            prms["p_rand"] = self._p_rand
        if self._p_lazy is not None:
            prms["p_lazy"] = self._p_lazy

        return FrozenLakeMDP.produce_gin_file_from_mdp_parameters(
            prms, type(self).__name__, index
        )

    @property
    def n_actions(self) -> int:
        return len(FrozenLakeAction)

    def _next_positions(self, x, y, a):
        if self.lake[x, y] == "G":
            return dict(X=0, Y=0)

        if a == FrozenLakeAction.LEFT:
            next_x, next_y = x, min(y + 1, self._size - 1)
        if a == FrozenLakeAction.DOWN:
            next_x, next_y = min(x + 1, self._size - 1), y
        if a == FrozenLakeAction.RIGHT:
            next_x, next_y = x, max(y - 1, 0)
        if a == FrozenLakeAction.UP:
            next_x, next_y = max(x - 1, 0), y
        next_pos = self.lake[next_x, next_y]
        if next_pos == "H":
            return dict(X=0, Y=0)
        else:
            return dict(X=next_x, Y=next_y)

    def _get_next_nodes_parameters(
        self, node: "NODE_TYPE", action: "ACTION_TYPE"
    ) -> Tuple[Tuple[dict, float], ...]:
        p = 0.5 if self._is_slippery else 1.0
        next_nodes_prms = []
        next_nodes_prms.append((self._next_positions(node.X, node.Y, action), p))
        if self._is_slippery:
            for a in [(action - 1) % 4, (action + 1) % 4]:
                next_nodes_prms.append((self._next_positions(node.X, node.Y, a), p / 2))
        return next_nodes_prms

    def _get_reward_distribution(
        self, node: "NODE_TYPE", action: "ACTION_TYPE", next_node: "NODE_TYPE"
    ) -> rv_continuous:
        if self.lake[next_node.X, next_node.Y] == "G":
            return self._goal_r
        return self._default_r

    def _get_starting_node_sampler(self) -> NextStateSampler:
        return NextStateSampler(next_nodes=self._possible_starting_nodes)

    def _check_parameters_in_input(self):
        super(FrozenLakeMDP, self)._check_parameters_in_input()

        assert self._p_frozen >= 0.1
        assert self._size > 2

        assert self._suboptimal_return + 0.2 < self._optimal_return

        dists = [
            self._goal_r,
            self._default_r,
        ]
        check_distributions(
            dists,
            self._make_reward_stochastic,
        )

    def _get_grid_representation(self, node: "NODE_TYPE") -> np.ndarray:
        grid = self.lake.copy()
        grid[0, 0] = "F"
        grid[node.X, node.Y] = "A"
        return grid.T[::-1, :]

    @property
    def _possible_starting_nodes(self) -> List["NODE_TYPE"]:
        return [FrozenLakeNode(0, 0)]

    @property
    def parameters(self) -> Dict[str, Any]:
        return {
            **super(FrozenLakeMDP, self).parameters,
            **dict(
                size=self._size,
                p_frozen=self._p_frozen,
                optimal_return=self._optimal_return,
                suboptimal_return=self._suboptimal_return,
                is_slippery=self._is_slippery,
                goal_r=self._goal_r,
                default_r=self._default_r,
            ),
        }

    def __init__(
        self,
        seed: int,
        size: int,
        p_frozen: float,
        optimal_return: float = 1.0,
        suboptimal_return: float = 0.1,
        is_slippery: bool = True,
        goal_r: Union[Tuple, rv_continuous] = None,
        default_r: Union[Tuple, rv_continuous] = None,
        make_reward_stochastic=False,
        reward_variance_multiplier: float = 1.0,
        **kwargs,
    ):
        """
        Parameters
        ----------
        seed : int
            The seed used for sampling rewards and next states.
        size : int
            The size of the grid.
        p_frozen : float
            The probability that a tile of the lake is frozen and does not contain a hole.
        optimal_return : float
            If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
            By default, it is set to 1.
        suboptimal_return : float
            If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
            By default, it is set to 0.1.
        is_slippery : bool
            If True, the outcome of the action is stochastic due to the frozen tiles being slippery. By default, it is
            set to True.
        goal_r : Union[Tuple, rv_continuous]
            The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta
            parameters or as a rv_continuous object.
        default_r : Union[Tuple, rv_continuous]
            The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as
            a rv_continuous object.
        make_reward_stochastic : bool
            If True, the rewards of the MDP will be stochastic. By default, it is set to False.
        reward_variance_multiplier : float
            A constant that can be used to increase the variance of the reward distributions without changing their
            means. The lower the value, the higher the variance. By default, it is set to 1.
        """

        if type(goal_r) == tuple:
            goal_r = get_dist(goal_r[0], goal_r[1])
        if type(default_r) == tuple:
            default_r = get_dist(default_r[0], default_r[1])

        self._size = size
        self._p_frozen = p_frozen
        self._optimal_return = optimal_return
        self._suboptimal_return = suboptimal_return
        self._is_slippery = is_slippery
        self._goal_r = goal_r
        self._default_r = default_r

        np.random.seed(seed)
        self.lake = np.array(
            list(
                map(
                    lambda x: list(x),
                    generate_random_map(size=self._size, p=self._p_frozen),
                )
            )
        )

        if (default_r, goal_r).count(None) == 0:
            self._default_r = default_r
            self._goal_r = goal_r
        else:
            if make_reward_stochastic:
                self._default_r = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier
                    * (size ** 2 / self._suboptimal_return - 1),
                )
                self._goal_r = beta(
                    reward_variance_multiplier * (size ** 2 / self._optimal_return - 1),
                    reward_variance_multiplier,
                )
            else:
                self._default_r = deterministic(0.0)
                self._goal_r = deterministic(1.0)

        super(FrozenLakeMDP, self).__init__(
            seed=seed,
            reward_variance_multiplier=reward_variance_multiplier,
            make_reward_stochastic=make_reward_stochastic,
            **kwargs,
        )
@dataclass(frozen=True)
class FrozenLakeNode:
The node for the FrozenLake MDP.
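FrozenLakeNode is a frozen dataclass, so instances are immutable and hashable, and the __str__ and __iter__ methods shown in the source above make nodes easy to print and unpack. A minimal usage sketch:

```python
from colosseum.mdp.frozen_lake.base import FrozenLakeNode

node = FrozenLakeNode(X=2, Y=3)
print(node)      # "X=2,Y=3" via __str__
x, y = node      # tuple-style unpacking via __iter__
assert (x, y) == (2, 3)

# frozen=True makes nodes hashable, so they can key dictionaries of per-node data.
visit_counts = {node: 0}
```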
class FrozenLakeAction(enum.IntEnum):
The action available in the FrozenLake MDP.
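Because FrozenLakeAction is an IntEnum, its members can be used interchangeably with integers; the base class relies on this, for example n_actions == len(FrozenLakeAction) and the slippery dynamics that select the two perpendicular actions via modular arithmetic. A small sketch:

```python
from colosseum.mdp.frozen_lake.base import FrozenLakeAction

assert len(FrozenLakeAction) == 4
assert FrozenLakeAction.UP == 0 and FrozenLakeAction.LEFT == 3

# IntEnum members behave as integers, which is how _get_next_nodes_parameters
# picks the two perpendicular actions when the lake is slippery.
a = FrozenLakeAction.RIGHT
perpendicular = [FrozenLakeAction((a - 1) % 4), FrozenLakeAction((a + 1) % 4)]
print([m.name for m in perpendicular])  # ['UP', 'DOWN']
```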
Inherited Members
- enum.Enum
- name
- value
- builtins.int
- conjugate
- bit_length
- to_bytes
- from_bytes
- as_integer_ratio
- real
- imag
- numerator
- denominator
class FrozenLakeMDP(colosseum.mdp.base.BaseMDP, abc.ABC):
The base class for the FrozenLake family.
FrozenLakeMDP(
    seed: int,
    size: int,
    p_frozen: float,
    optimal_return: float = 1.0,
    suboptimal_return: float = 0.1,
    is_slippery: bool = True,
    goal_r: Union[Tuple, scipy.stats.rv_continuous] = None,
    default_r: Union[Tuple, scipy.stats.rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
)
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The size of the grid.
- p_frozen (float): The probability that a tile of the lake is frozen and does not contain a hole.
- optimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 1.
- suboptimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.1.
- is_slippery (bool): If True, the outcome of the action is stochastic due to the frozen tiles being slippery. By default, it is set to True.
- goal_r (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- default_r (Union[Tuple, rv_continuous]): The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
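A minimal construction sketch. The concrete class name FrozenLakeEpisodic is an assumption about the episodic FrozenLake variant shipped with colosseum (adjust the import if it differs); the reward distributions are passed as ("distribution_name", parameters) tuples, mirroring the format produced by sample_mdp_parameters.

```python
# Hypothetical usage sketch: FrozenLakeEpisodic is assumed to be the episodic
# subclass of FrozenLakeMDP provided by colosseum.
from colosseum.mdp.frozen_lake import FrozenLakeEpisodic

mdp = FrozenLakeEpisodic(
    seed=42,
    size=5,
    p_frozen=0.9,           # 90% of tiles are frozen (no hole)
    is_slippery=True,       # the intended move succeeds with probability 0.5
    make_reward_stochastic=True,
    # Reward distributions as ("name", params) tuples; these values mirror the
    # Beta defaults the class would construct for size=5.
    goal_r=("beta", (24.0, 1.0)),
    default_r=("beta", (1.0, 249.0)),
)
print(mdp.parameters["p_frozen"], mdp.n_actions)  # 0.9 4
```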
@staticmethod
def get_unique_symbols() -> List[str]:
Returns
- List[str]: the unique symbols of the grid representation of the MDP.
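These symbols encode the grid view produced by the class: "A" marks the agent's current tile, "F" a frozen tile, "H" a hole, and "G" the goal (see _get_grid_representation in the source above). Being a static method, it can be called without an instance:

```python
from colosseum.mdp.frozen_lake.base import FrozenLakeMDP

print(FrozenLakeMDP.get_unique_symbols())  # ['A', 'F', 'H', 'G']
# 'A' = agent position, 'F' = frozen tile, 'H' = hole, 'G' = goal
```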
@staticmethod
def does_seed_change_MDP_structure() -> bool:
Returns
- bool: True if changing the seed changes the transition matrix and/or the rewards matrix. This may happen, for example, when there are fewer effective starting states than possible ones and the effective starting states are picked randomly based on the seed.
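For FrozenLake this returns True because the hole layout itself depends on the seed: the constructor seeds NumPy and then calls gym's generate_random_map. A small sketch of that mechanism, mirroring what the constructor does rather than calling it:

```python
import numpy as np
from gym.envs.toy_text.frozen_lake import generate_random_map

def lake_for_seed(seed: int, size: int = 5, p_frozen: float = 0.9):
    # Mirrors FrozenLakeMDP.__init__: the global NumPy seed determines the map.
    np.random.seed(seed)
    return generate_random_map(size=size, p=p_frozen)

print(lake_for_seed(0))
print(lake_for_seed(1))  # typically a different hole layout, hence a different MDP structure
```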
@staticmethod
def sample_mdp_parameters(n: int, is_episodic: bool, seed: int = None) -> List[Dict[str, Any]]:
Returns
- List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
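A minimal sketch of drawing parameter sets; the returned dictionaries use the same keyword names as the constructor, so they can be forwarded directly (the concrete class mentioned in the comment is an assumption):

```python
from colosseum.mdp.frozen_lake.base import FrozenLakeMDP

# Static method: sample three parameter sets for episodic FrozenLake MDPs.
params = FrozenLakeMDP.sample_mdp_parameters(n=3, is_episodic=True, seed=42)
print(sorted(params[0]))
# ['default_r', 'goal_r', 'make_reward_stochastic', 'p_frozen', 'p_lazy',
#  'p_rand', 'reward_variance_multiplier', 'size']

# The keys match the constructor's keyword arguments, so, assuming an episodic
# subclass named FrozenLakeEpisodic, one could do:
# mdp = FrozenLakeEpisodic(seed=0, **params[0])
```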
@staticmethod
def get_node_class() -> Type[Union[colosseum.mdp.custom_mdp.CustomNode, colosseum.mdp.river_swim.base.RiverSwimNode, colosseum.mdp.deep_sea.base.DeepSeaNode, colosseum.mdp.frozen_lake.base.FrozenLakeNode, colosseum.mdp.simple_grid.base.SimpleGridNode, colosseum.mdp.minigrid_empty.base.MiniGridEmptyNode, colosseum.mdp.minigrid_rooms.base.MiniGridRoomsNode, colosseum.mdp.taxi.base.TaxiNode]]:
Returns
- Type["NODE_TYPE"]: The class of the nodes of the MDP.
def get_gin_parameters(self, index: int) -> str:
Returns
- str: The gin config of the MDP instance.
Inherited Members
- colosseum.mdp.base.BaseMDP
- get_available_hardness_measures
- produce_gin_file_from_mdp_parameters
- is_episodic
- sample_parameters
- get_grid_representation
- get_gin_config
- get_node_labels
- get_node_action_labels
- hash
- instantiate_MDP
- T
- R
- recurrent_nodes_set
- communication_class
- get_optimal_policy
- get_worst_policy
- get_value_functions
- optimal_value_functions
- worst_value_functions
- random_value_functions
- optimal_transition_probabilities
- worst_transition_probabilities
- random_transition_probabilities
- optimal_markov_chain
- worst_markov_chain
- random_markov_chain
- get_stationary_distribution
- optimal_stationary_distribution
- worst_stationary_distribution
- random_stationary_distribution
- optimal_average_rewards
- worst_average_rewards
- random_average_rewards
- get_average_reward
- optimal_average_reward
- worst_average_reward
- random_average_reward
- transition_matrix_and_rewards
- graph_layout
- graph_metrics
- diameter
- sum_reciprocals_suboptimality_gaps
- discounted_value_norm
- undiscounted_value_norm
- value_norm
- measures_of_hardness
- summary
- hardness_report
- get_info_class
- get_transition_distributions
- get_reward_distribution
- sample_reward
- get_measure_from_name
- action_spec
- observation_spec
- get_observation
- reset
- step
- random_steps
- random_step
- get_visitation_counts
- reset_visitation_counts
- get_value_node_labels
- dm_env._environment.Environment
- reward_spec
- discount_spec
- close