colosseum.mdp.simple_grid.base
```python
import abc
from dataclasses import dataclass
from enum import IntEnum
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union

import gin
import numpy as np
from scipy.stats import beta, rv_continuous

from colosseum.mdp import BaseMDP
from colosseum.mdp.utils.custom_samplers import NextStateSampler
from colosseum.utils.miscellanea import (
    check_distributions,
    deterministic,
    get_dist,
    rounding_nested_structure,
)

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE, NODE_TYPE


@dataclass(frozen=True)
class SimpleGridNode:
    """
    The node for the SimpleGrid MDP.
    """

    X: int
    """x coordinate."""
    Y: int
    """y coordinate."""

    def __str__(self):
        return f"X={self.X},Y={self.Y}"

    def __iter__(self):
        return iter((self.X, self.Y))


class SimpleGridAction(IntEnum):
    """
    The actions available in the SimpleGrid MDP.
    """

    UP = 0
    RIGHT = 1
    DOWN = 2
    LEFT = 3
    NO_OP = 4


@gin.constants_from_enum
class SimpleGridReward(IntEnum):
    """
    The reward types available in the SimpleGrid MDP. It controls the rewards for the corner states.
    """

    AND = 0
    NAND = 1
    OR = 2
    XOR = 3


class SimpleGridMDP(BaseMDP, abc.ABC):
    """
    The base class for the SimpleGrid family.
    """

    @staticmethod
    def get_action_class() -> SimpleGridAction:
        return SimpleGridAction

    @staticmethod
    def get_unique_symbols() -> List[str]:
        return [" ", "A", "+", "-"]

    @staticmethod
    def does_seed_change_MDP_structure() -> bool:
        return True

    @staticmethod
    def sample_mdp_parameters(
        n: int, is_episodic: bool, seed: int = None
    ) -> List[Dict[str, Any]]:
        rng = np.random.RandomState(np.random.randint(10_000) if seed is None else seed)
        samples = []
        for _ in range(n):
            p_rand, p_lazy, _ = 0.9 * rng.dirichlet([0.2, 0.2, 5])
            sample = dict(
                size=int(
                    (
                        1
                        + np.minimum((800 / (100 * rng.random() + 35)), 25)
                        * (0.8 if is_episodic else 1)
                    )
                ),
                n_starting_states=rng.randint(1, 5),
                p_rand=p_rand,
                p_lazy=p_lazy,
                make_reward_stochastic=rng.choice([True, False]),
                reward_variance_multiplier=2 * rng.random() + 0.005,
            )
            sample["p_rand"] = None if sample["p_rand"] < 0.01 else sample["p_rand"]
            sample["p_lazy"] = None if sample["p_lazy"] < 0.01 else sample["p_lazy"]

            sample["reward_type"] = rng.randint(4)

            if sample["make_reward_stochastic"]:
                sample["sub_optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"] * (10 / 0.2 - 1),
                    ),
                )
                sample["optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"] * (1 / 0.9 - 1),
                    ),
                )
                sample["other_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"] * (1 / 0.2 - 1),
                    ),
                )
            else:
                sample["sub_optimal_distribution"] = ("deterministic", (0.0,))
                sample["optimal_distribution"] = ("deterministic", (1.0,))
                sample["other_distribution"] = ("deterministic", (0.5,))

            samples.append(rounding_nested_structure(sample))
        return samples

    @staticmethod
    def get_node_class() -> Type["NODE_TYPE"]:
        return SimpleGridNode

    def get_gin_parameters(self, index: int) -> str:
        prms = dict(
            size=self._size,
            n_starting_states=self._n_starting_states,
            reward_type=int(self._reward_type),
            make_reward_stochastic=self._make_reward_stochastic,
            reward_variance_multiplier=self._reward_variance_multiplier,
            sub_optimal_distribution=(
                self._sub_optimal_distribution.dist.name,
                self._sub_optimal_distribution.args,
            ),
            optimal_distribution=(
                self._optimal_distribution.dist.name,
                self._optimal_distribution.args,
            ),
            other_distribution=(
                self._other_distribution.dist.name,
                self._other_distribution.args,
            ),
        )
        if self._p_rand is not None:
            prms["p_rand"] = self._p_rand

        return SimpleGridMDP.produce_gin_file_from_mdp_parameters(
            prms, type(self).__name__, index
        )

    @property
    def n_actions(self) -> int:
        return len(SimpleGridAction)

    def _get_next_nodes_parameters(
        self, node: "NODE_TYPE", action: "ACTION_TYPE"
    ) -> Tuple[Tuple[dict, float], ...]:
        if action == SimpleGridAction.UP:
            return ((dict(X=node.X, Y=min(node.Y + 1, self._size - 1)), 1.0),)
        if action == SimpleGridAction.RIGHT:
            return ((dict(X=min(node.X + 1, self._size - 1), Y=node.Y), 1.0),)
        if action == SimpleGridAction.DOWN:
            return ((dict(X=node.X, Y=max(node.Y - 1, 0)), 1.0),)
        if action == SimpleGridAction.LEFT:
            return ((dict(X=max(node.X - 1, 0), Y=node.Y), 1.0),)
        if action == SimpleGridAction.NO_OP:
            return ((dict(X=node.X, Y=node.Y), 1.0),)

    @staticmethod
    def _is_corner_loop(node, next_node, size):
        return (
            node.X == next_node.X
            and node.Y == next_node.Y
            and node.X in [0, size - 1]
            and node.Y in [0, size - 1]
        )

    def _get_reward_distribution(
        self, node: "NODE_TYPE", action: "ACTION_TYPE", next_node: "NODE_TYPE"
    ) -> rv_continuous:
        # Corner nodes
        if SimpleGridMDP._is_corner_loop(node, next_node, self._size):
            if (
                (self._reward_type == SimpleGridReward.AND and (node.X and node.Y))
                or (
                    self._reward_type == SimpleGridReward.NAND
                    and not (node.X and node.Y)
                )
                or (self._reward_type == SimpleGridReward.OR and (node.X | node.Y))
                or (self._reward_type == SimpleGridReward.XOR and (node.X ^ node.Y))
            ):
                return self._optimal_distribution
            else:
                return self._sub_optimal_distribution
        else:
            return self._other_distribution

    def _calculate_starting_nodes(self):
        center = np.array(((self._size - 1) / 2, (self._size - 1) / 2))
        distances = np.empty((self._size, self._size))
        for x in range(self._size):
            for y in range(self._size):
                distances[x, y] = ((np.array((x, y)) - center) ** 2).sum()

        batch: list = np.array(np.where(distances == distances.min())).T.tolist()
        self._rng.shuffle(batch)
        while not np.all(distances == np.inf):
            distances[batch[0][0], batch[0][1]] = np.inf
            yield batch[0]
            batch.pop(0)
            if len(batch) == 0:
                batch: list = np.array(
                    np.where(distances == distances.min())
                ).T.tolist()

    def _get_starting_node_sampler(self) -> NextStateSampler:
        starting_nodes_iter = self._calculate_starting_nodes()
        self.__possible_starting_nodes = [
            self.get_node_class()(*next(starting_nodes_iter))
            for _ in range((self._size - 1) ** 2)
        ]
        starting_nodes = self._possible_starting_nodes[: self._n_starting_states]
        self._rng.shuffle(starting_nodes)
        if len(starting_nodes) == 1:
            return NextStateSampler(next_nodes=starting_nodes)
        return NextStateSampler(
            next_nodes=starting_nodes,
            probs=[1 / self._n_starting_states for _ in range(self._n_starting_states)],
            seed=self._produce_random_seed(),
        )

    def _check_parameters_in_input(self):
        super(SimpleGridMDP, self)._check_parameters_in_input()

        assert self._n_starting_states <= (self._size - 1) ** 2
        assert self._optimal_mean_reward - 0.1 > self._sub_optimal_mean_reward

        dists = [
            self._sub_optimal_distribution,
            self._optimal_distribution,
            self._other_distribution,
        ]
        check_distributions(
            dists,
            self._make_reward_stochastic,
        )

    def _get_grid_representation(self, node: "NODE_TYPE") -> np.ndarray:
        grid = np.zeros((self._size, self._size), dtype=str)
        grid[:, :] = " "

        # Corner nodes
        if self._reward_type == SimpleGridReward.AND:
            grid[0, 0] = "-"
            grid[0, -1] = "-"
            grid[-1, 0] = "-"
            grid[-1, -1] = "+"
        elif self._reward_type == SimpleGridReward.NAND:
            grid[0, 0] = "+"
            grid[0, -1] = "+"
            grid[-1, 0] = "+"
            grid[-1, -1] = "-"
        elif self._reward_type == SimpleGridReward.OR:
            grid[0, 0] = "-"
            grid[0, -1] = "+"
            grid[-1, 0] = "+"
            grid[-1, -1] = "+"
        else:
            grid[0, 0] = "-"
            grid[0, -1] = "+"
            grid[-1, 0] = "+"
            grid[-1, -1] = "-"

        grid[node.Y, node.X] = "A"
        return grid[::-1, :]

    @property
    def _possible_starting_nodes(self) -> List["NODE_TYPE"]:
        return self.__possible_starting_nodes

    @property
    def parameters(self) -> Dict[str, Any]:
        return {
            **super(SimpleGridMDP, self).parameters,
            **dict(
                size=self._size,
                reward_type=self._reward_type,
                n_starting_states=self._n_starting_states,
                optimal_mean_reward=self._optimal_mean_reward,
                sub_optimal_mean_reward=self._sub_optimal_mean_reward,
                optimal_distribution=self._optimal_distribution,
                sub_optimal_distribution=self._sub_optimal_distribution,
                other_distribution=self._other_distribution,
            ),
        }

    def __init__(
        self,
        seed: int,
        size: int,
        reward_type: SimpleGridReward = SimpleGridReward.XOR,
        n_starting_states: int = 1,
        optimal_mean_reward: float = 0.9,
        sub_optimal_mean_reward: float = 0.2,
        optimal_distribution: Union[Tuple, rv_continuous] = None,
        sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
        other_distribution: Union[Tuple, rv_continuous] = None,
        make_reward_stochastic=False,
        reward_variance_multiplier: float = 1.0,
        **kwargs,
    ):
        """

        Parameters
        ----------
        seed : int
            The seed used for sampling rewards and next states.
        size : int
            The size of the grid.
        reward_type : SimpleGridReward
            The type of reward for the MDP. By default, the XOR type is used.
        n_starting_states : int
            The number of possible starting states.
        optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
            By default, it is set to 0.9.
        sub_optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
            By default, it is set to 0.2.
        optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
            or as a rv_continuous object.
        sub_optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
            parameters or as a rv_continuous object.
        other_distribution : Union[Tuple, rv_continuous]
            The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
            rv_continuous object.
        make_reward_stochastic : bool
            If True, the rewards of the MDP will be stochastic. By default, it is set to False.
        reward_variance_multiplier : float
            A constant that can be used to increase the variance of the reward distributions without changing their means.
            The lower the value, the higher the variance. By default, it is set to 1.
        """

        if type(sub_optimal_distribution) == tuple:
            sub_optimal_distribution = get_dist(
                sub_optimal_distribution[0], sub_optimal_distribution[1]
            )
        if type(optimal_distribution) == tuple:
            optimal_distribution = get_dist(
                optimal_distribution[0], optimal_distribution[1]
            )
        if type(other_distribution) == tuple:
            other_distribution = get_dist(other_distribution[0], other_distribution[1])

        self._size = size
        self._reward_type = SimpleGridReward(reward_type)
        self._n_starting_states = n_starting_states
        self._optimal_mean_reward = optimal_mean_reward
        self._sub_optimal_mean_reward = sub_optimal_mean_reward
        dists = [
            sub_optimal_distribution,
            optimal_distribution,
            other_distribution,
        ]

        if dists.count(None) == 0:
            self._sub_optimal_distribution = sub_optimal_distribution
            self._optimal_distribution = optimal_distribution
            self._other_distribution = other_distribution
        else:
            if make_reward_stochastic:
                self._sub_optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (10 / sub_optimal_mean_reward - 1),
                )
                self._optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / optimal_mean_reward - 1),
                )
                self._other_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / sub_optimal_mean_reward - 1),
                )
            else:
                self._sub_optimal_distribution = deterministic(0.0)
                self._optimal_distribution = deterministic(1.0)
                self._other_distribution = deterministic(0.5)

        super(SimpleGridMDP, self).__init__(
            seed=seed,
            reward_variance_multiplier=reward_variance_multiplier,
            make_reward_stochastic=make_reward_stochastic,
            **kwargs,
        )
```
@dataclass(frozen=True)
class SimpleGridNode:
The node for the SimpleGrid MDP.
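A node is an immutable pair of grid coordinates. A minimal usage sketch (the coordinate values are arbitrary):

```python
node = SimpleGridNode(X=2, Y=0)
print(node)   # "X=2,Y=0", as produced by __str__
x, y = node   # __iter__ allows unpacking the coordinates
```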
class SimpleGridAction(enum.IntEnum):
The actions available in the SimpleGrid MDP.
Inherited Members
- enum.Enum
- name
- value
- builtins.int
- conjugate
- bit_length
- to_bytes
- from_bytes
- as_integer_ratio
- real
- imag
- numerator
- denominator
@gin.constants_from_enum
class SimpleGridReward(enum.IntEnum):
The reward types available in the SimpleGrid MDP. It controls the rewards for the corner states.
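Each reward type acts as a Boolean function of the corner coordinates: a self-loop in a corner state draws its reward from the optimal distribution when the function evaluated on (X, Y) is true, and from the suboptimal distribution otherwise (see `_get_reward_distribution` in the source above). The sketch below reproduces this rule for the four corners with a hypothetical helper, `rewarding_corners`, that is not part of the library:

```python
def rewarding_corners(reward_type: SimpleGridReward, size: int):
    """Corners whose self-loop reward comes from the optimal distribution."""
    corners = [(0, 0), (0, size - 1), (size - 1, 0), (size - 1, size - 1)]
    rules = {
        SimpleGridReward.AND: lambda x, y: bool(x and y),
        SimpleGridReward.NAND: lambda x, y: not (x and y),
        SimpleGridReward.OR: lambda x, y: bool(x or y),
        SimpleGridReward.XOR: lambda x, y: bool(x) != bool(y),
    }
    return [c for c in corners if rules[reward_type](*c)]

print(rewarding_corners(SimpleGridReward.XOR, size=5))  # [(0, 4), (4, 0)]
```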
Inherited Members
- enum.Enum
- name
- value
- builtins.int
- conjugate
- bit_length
- to_bytes
- from_bytes
- as_integer_ratio
- real
- imag
- numerator
- denominator
class SimpleGridMDP(BaseMDP, abc.ABC):
The base class for the SimpleGrid family.
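`SimpleGridMDP` itself is abstract and is meant to be used through the concrete members of the family. A minimal sketch, assuming the episodic subclass is named `SimpleGridEpisodic` (check `colosseum.mdp.simple_grid` for the actual concrete classes) and using the `reset`/`step` interface inherited from `BaseMDP`:

```python
# Sketch only: the import path and subclass name are assumptions.
from colosseum.mdp.simple_grid import SimpleGridEpisodic

mdp = SimpleGridEpisodic(seed=0, size=5, n_starting_states=2)
ts = mdp.reset()                       # dm_env-style TimeStep with the initial observation
ts = mdp.step(SimpleGridAction.RIGHT)  # deterministic move, clipped at the grid border
```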
SimpleGridMDP( seed: int, size: int, reward_type: colosseum.mdp.simple_grid.base.SimpleGridReward = <SimpleGridReward.XOR: 3>, n_starting_states: int = 1, optimal_mean_reward: float = 0.9, sub_optimal_mean_reward: float = 0.2, optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, sub_optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, other_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, make_reward_stochastic=False, reward_variance_multiplier: float = 1.0, **kwargs)
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The size of the grid.
- reward_type (SimpleGridReward): The type of reward for the MDP. By default, the XOR type is used.
- n_starting_states (int): The number of possible starting states.
- optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 0.9.
- sub_optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.2.
- optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object; both forms are shown in the sketch after this list.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
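A hedged sketch of the two accepted distribution formats, mirroring the tuple format produced by `sample_mdp_parameters` and the frozen `scipy.stats` objects built in `__init__` (the keyword values are arbitrary and the commented-out subclass name is an assumption):

```python
from scipy.stats import beta

mdp_kwargs = dict(
    seed=42,
    size=5,
    make_reward_stochastic=True,
    # Tuple form: (distribution name, parameters); converted internally with get_dist.
    optimal_distribution=("beta", (1.0, 1 / 0.9 - 1)),
    # Frozen scipy.stats distributions can be passed directly instead.
    sub_optimal_distribution=beta(1.0, 10 / 0.2 - 1),
    other_distribution=beta(1.0, 1 / 0.2 - 1),
)
# mdp = SimpleGridEpisodic(**mdp_kwargs)  # assumed concrete subclass, as above
```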
@staticmethod
def get_unique_symbols() -> List[str]:
Returns
- List[str]: the unique symbols of the grid representation of the MDP.
@staticmethod
def does_seed_change_MDP_structure() -> bool:
Returns
- bool: True if changing the seed changes the transition matrix and/or the rewards matrix. This may happen, for example, when there are fewer starting states than possible ones and the effective starting states are picked at random based on the seed.
@staticmethod
def sample_mdp_parameters(n: int, is_episodic: bool, seed: int = None) -> List[Dict[str, Any]]:
Returns
- List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
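For example, a short sketch of drawing a few parameter sets and inspecting one of them; feeding it to a concrete subclass is shown commented out, with the subclass name assumed as above:

```python
params_list = SimpleGridMDP.sample_mdp_parameters(n=3, is_episodic=True, seed=42)
params = params_list[0]
print(sorted(params))  # size, n_starting_states, p_rand, p_lazy, reward_type, ... distributions
# mdp = SimpleGridEpisodic(seed=0, **params)
```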
@staticmethod
def get_node_class() -> Type[Union[colosseum.mdp.custom_mdp.CustomNode, colosseum.mdp.river_swim.base.RiverSwimNode, colosseum.mdp.deep_sea.base.DeepSeaNode, colosseum.mdp.frozen_lake.base.FrozenLakeNode, colosseum.mdp.simple_grid.base.SimpleGridNode, colosseum.mdp.minigrid_empty.base.MiniGridEmptyNode, colosseum.mdp.minigrid_rooms.base.MiniGridRoomsNode, colosseum.mdp.taxi.base.TaxiNode]]:
Returns
- Type["NODE_TYPE"]: The class of the nodes of the MDP.
def get_gin_parameters(self, index: int) -> str:
Returns
- str: The gin config of the MDP instance.
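A small sketch of how the returned string is typically used, assuming `mdp` is an instance of a concrete SimpleGrid subclass; the exact binding format depends on `produce_gin_file_from_mdp_parameters`:

```python
gin_config = mdp.get_gin_parameters(index=0)
print(gin_config)  # gin bindings for size, n_starting_states, reward_type, distributions, ...
```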
Inherited Members
- colosseum.mdp.base.BaseMDP
- get_available_hardness_measures
- produce_gin_file_from_mdp_parameters
- is_episodic
- sample_parameters
- get_grid_representation
- get_gin_config
- get_node_labels
- get_node_action_labels
- hash
- instantiate_MDP
- T
- R
- recurrent_nodes_set
- communication_class
- get_optimal_policy
- get_worst_policy
- get_value_functions
- optimal_value_functions
- worst_value_functions
- random_value_functions
- optimal_transition_probabilities
- worst_transition_probabilities
- random_transition_probabilities
- optimal_markov_chain
- worst_markov_chain
- random_markov_chain
- get_stationary_distribution
- optimal_stationary_distribution
- worst_stationary_distribution
- random_stationary_distribution
- optimal_average_rewards
- worst_average_rewards
- random_average_rewards
- get_average_reward
- optimal_average_reward
- worst_average_reward
- random_average_reward
- transition_matrix_and_rewards
- graph_layout
- graph_metrics
- diameter
- sum_reciprocals_suboptimality_gaps
- discounted_value_norm
- undiscounted_value_norm
- value_norm
- measures_of_hardness
- summary
- hardness_report
- get_info_class
- get_transition_distributions
- get_reward_distribution
- sample_reward
- get_measure_from_name
- action_spec
- observation_spec
- get_observation
- reset
- step
- random_steps
- random_step
- get_visitation_counts
- reset_visitation_counts
- get_value_node_labels
- dm_env._environment.Environment
- reward_spec
- discount_spec
- close