colosseum.mdp.deep_sea.base
```python
import abc
from dataclasses import dataclass
from enum import IntEnum
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union

import numpy as np
from scipy.stats import beta, rv_continuous

from colosseum.mdp import BaseMDP
from colosseum.mdp.utils.custom_samplers import NextStateSampler
from colosseum.utils.miscellanea import (
    check_distributions,
    deterministic,
    get_dist,
    rounding_nested_structure,
)

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE, NODE_TYPE


@dataclass(frozen=True)
class DeepSeaNode:
    """
    The node for the DeepSea MDP.
    """

    X: int
    """x coordinate."""
    Y: int
    """y coordinate."""

    def __str__(self):
        return f"X={self.X},Y={self.Y}"

    def __iter__(self):
        return iter((self.X, self.Y))


class DeepSeaAction(IntEnum):
    """
    The actions available in the DeepSea MDP.
    """

    LEFT = 0
    """Move towards the left."""
    RIGHT = 1
    """Move towards the right."""


class DeepSeaMDP(BaseMDP, abc.ABC):
    """
    The base class for the DeepSea family.
    """

    @staticmethod
    def get_unique_symbols() -> List[str]:
        return ["A", " "]

    @staticmethod
    def does_seed_change_MDP_structure() -> bool:
        return False

    @staticmethod
    def sample_mdp_parameters(
        n: int, is_episodic: bool, seed: int = None
    ) -> List[Dict[str, Any]]:
        rng = np.random.RandomState(np.random.randint(10_000) if seed is None else seed)
        samples = []
        for _ in range(n):
            sample = dict(
                size=int(
                    (1 + np.minimum((800 / (100 * rng.random() + 35)), 25))
                    * (0.8 if is_episodic else 1)
                ),
                p_rand=min(2 / (8 * rng.random() + 3), 0.95),
                make_reward_stochastic=rng.choice([True, False]),
                reward_variance_multiplier=2 * rng.random() + 0.005,
            )
            sample["p_rand"] = None if sample["p_rand"] < 0.01 else sample["p_rand"]

            if sample["make_reward_stochastic"]:
                sample["sub_optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"]
                        * (sample["size"] / 0.5 - 1),
                    ),
                )
                sample["optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"] * (sample["size"] / 1 - 1),
                        sample["reward_variance_multiplier"],
                    ),
                )
                sample["other_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"]
                        * 10
                        * (sample["size"] / 0.5 - 1),
                    ),
                )
            else:
                sample["sub_optimal_distribution"] = (
                    "deterministic",
                    (1.0 / (sample["size"] ** 2),),
                )
                sample["optimal_distribution"] = ("deterministic", (1.0,))
                sample["other_distribution"] = ("deterministic", (0.0,))

            samples.append(rounding_nested_structure(sample))

        return samples

    @staticmethod
    def get_node_class() -> Type[DeepSeaNode]:
        return DeepSeaNode

    def get_gin_parameters(self, index: int) -> str:
        prms = dict(
            size=self._size,
            make_reward_stochastic=self._make_reward_stochastic,
            reward_variance_multiplier=self._reward_variance_multiplier,
            sub_optimal_distribution=(
                self._sub_optimal_distribution.dist.name,
                self._sub_optimal_distribution.args,
            ),
            optimal_distribution=(
                self._optimal_distribution.dist.name,
                self._optimal_distribution.args,
            ),
            other_distribution=(
                self._other_distribution.dist.name,
                self._other_distribution.args,
            ),
        )
        if self._p_rand is not None:
            prms["p_rand"] = self._p_rand

        return DeepSeaMDP.produce_gin_file_from_mdp_parameters(
            prms, type(self).__name__, index
        )

    @property
    def n_actions(self) -> int:
        return len(DeepSeaAction)

    def __init__(
        self,
        seed: int,
        size: int,
        optimal_return: float = 1.0,
        suboptimal_return: float = 0.5,
        optimal_distribution: Union[Tuple, rv_continuous] = None,
        sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
        other_distribution: Union[Tuple, rv_continuous] = None,
        make_reward_stochastic=False,
        reward_variance_multiplier: float = 1.0,
        **kwargs,
    ):
        """
        Parameters
        ----------
        seed : int
            The seed used for sampling rewards and next states.
        size : int
            The size of the grid.
        optimal_return : float
            If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
            By default, it is set to 1.
        suboptimal_return: float
            If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
            By default, it is set to 0.5.
        optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
            or as a rv_continuous object.
        sub_optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
            parameters or as a rv_continuous object.
        other_distribution : Union[Tuple, rv_continuous]
            The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
            rv_continuous object.
        make_reward_stochastic : bool
            If True, the rewards of the MDP will be stochastic. By default, it is set to False.
        reward_variance_multiplier : float
            A constant that can be used to increase the variance of the reward distributions without changing their means.
            The lower the value, the higher the variance. By default, it is set to 1.
        """

        if type(sub_optimal_distribution) == tuple:
            sub_optimal_distribution = get_dist(
                sub_optimal_distribution[0], sub_optimal_distribution[1]
            )
        if type(optimal_distribution) == tuple:
            optimal_distribution = get_dist(
                optimal_distribution[0], optimal_distribution[1]
            )
        if type(other_distribution) == tuple:
            other_distribution = get_dist(other_distribution[0], other_distribution[1])

        self._size = size
        self._optimal_return = optimal_return
        self._suboptimal_return = suboptimal_return
        self._optimal_distribution = optimal_distribution
        self._sub_optimal_distribution = sub_optimal_distribution
        self._other_distribution = other_distribution

        dists = [
            sub_optimal_distribution,
            optimal_distribution,
            other_distribution,
        ]
        if dists.count(None) == 0:
            self._sub_optimal_distribution = sub_optimal_distribution
            self._optimal_distribution = optimal_distribution
            self._other_distribution = other_distribution
        else:
            if make_reward_stochastic:
                self._sub_optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (size / self._suboptimal_return - 1),
                )
                self._optimal_distribution = beta(
                    reward_variance_multiplier * (size / self._optimal_return - 1),
                    reward_variance_multiplier,
                )
                self._other_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier
                    * 10
                    * (size / self._suboptimal_return - 1),
                )
            else:
                self._sub_optimal_distribution = deterministic(1.0 / (size ** 2))
                self._optimal_distribution = deterministic(1.0)
                self._other_distribution = deterministic(0.0)

        super(DeepSeaMDP, self).__init__(
            seed=seed,
            reward_variance_multiplier=reward_variance_multiplier,
            make_reward_stochastic=make_reward_stochastic,
            **kwargs,
        )

    @property
    def _possible_starting_nodes(self) -> List[DeepSeaNode]:
        return [DeepSeaNode(0, self._size - 1)]

    def _get_next_nodes_parameters(
        self, node: "NODE_TYPE", action: "ACTION_TYPE"
    ) -> Tuple[Tuple[dict, float], ...]:
        if node.Y == 0:
            return ((dict(X=0, Y=self._size - 1), 1.0),)

        return (
            (
                dict(
                    X=min(node.X + 1, self._size - 1)
                    if action == DeepSeaAction.RIGHT
                    else max(node.X - 1, 0),
                    Y=max(0, node.Y - 1),
                ),
                1.0,
            ),
        )

    def _get_reward_distribution(
        self, node: "NODE_TYPE", action: "ACTION_TYPE", next_node: "NODE_TYPE"
    ) -> rv_continuous:
        return (
            self._optimal_distribution
            if node.X == self._size - 1
            and node.Y == 0
            and action == DeepSeaAction.RIGHT
            else (
                self._sub_optimal_distribution
                if action == DeepSeaAction.LEFT
                else self._other_distribution
            )
        )

    def _get_starting_node_sampler(self) -> NextStateSampler:
        return NextStateSampler(next_nodes=self._possible_starting_nodes)

    def _check_parameters_in_input(self):
        super(DeepSeaMDP, self)._check_parameters_in_input()

        assert self._size > 1

        # No lazy mechanic for DeepSea
        assert self._p_lazy is None

        assert self._suboptimal_return < self._optimal_return - 0.1

        dists = [
            self._sub_optimal_distribution,
            self._optimal_distribution,
            self._other_distribution,
        ]
        check_distributions(
            dists,
            self._make_reward_stochastic,
        )

    def _get_grid_representation(self, node: "NODE_TYPE") -> np.ndarray:
        grid = np.zeros((self._size, self._size), dtype=str)
        grid[:, :] = " "
        grid[node.Y, node.X] = "A"
        return grid[::-1, :]

    @property
    def parameters(self) -> Dict[str, Any]:
        return {
            **super(DeepSeaMDP, self).parameters,
            **dict(
                size=self._size,
                optimal_return=self._optimal_return,
                suboptimal_return=self._suboptimal_return,
                optimal_distribution=self._optimal_distribution,
                sub_optimal_distribution=self._sub_optimal_distribution,
                other_distribution=self._other_distribution,
            ),
        }
```
@dataclass(frozen=True)
class DeepSeaNode:
The node for the DeepSea MDP.
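Based on the definition above, a minimal usage sketch of the node type:

```python
from colosseum.mdp.deep_sea.base import DeepSeaNode

# A node is an immutable (X, Y) coordinate pair.
node = DeepSeaNode(X=0, Y=4)

print(node)   # "X=0,Y=4", from __str__
x, y = node   # __iter__ allows tuple-style unpacking
assert (x, y) == (0, 4)

# frozen=True makes the dataclass hashable, so nodes can be used as dict keys.
visit_counts = {node: 1}
```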
class DeepSeaAction(enum.IntEnum):
The actions available in the DeepSea MDP.
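A short sketch of how the action enum behaves as an IntEnum:

```python
from colosseum.mdp.deep_sea.base import DeepSeaAction

# IntEnum members compare equal to their integer values.
assert DeepSeaAction.LEFT == 0
assert DeepSeaAction.RIGHT == 1

# The MDP's n_actions property is simply the number of enum members.
assert len(DeepSeaAction) == 2
```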
Inherited Members
- enum.Enum
- name
- value
- builtins.int
- conjugate
- bit_length
- to_bytes
- from_bytes
- as_integer_ratio
- real
- imag
- numerator
- denominator
class DeepSeaMDP(BaseMDP, abc.ABC):
The base class for the DeepSea family.
DeepSeaMDP(seed: int, size: int, optimal_return: float = 1.0, suboptimal_return: float = 0.5, optimal_distribution: Union[Tuple, rv_continuous] = None, sub_optimal_distribution: Union[Tuple, rv_continuous] = None, other_distribution: Union[Tuple, rv_continuous] = None, make_reward_stochastic=False, reward_variance_multiplier: float = 1.0, **kwargs)
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The size of the grid.
- optimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 1.
- suboptimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.5.
- optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that controls the variance of the reward distributions without changing their means: the lower the value, the higher the variance. By default, it is set to 1.
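DeepSeaMDP is abstract, so instances are created through a concrete DeepSea variant. The following construction sketch assumes an episodic subclass named DeepSeaEpisodic importable from colosseum.mdp.deep_sea; the class name and import path are assumptions, not guaranteed by this module.

```python
from colosseum.mdp.deep_sea import DeepSeaEpisodic  # assumed concrete subclass

# Deterministic rewards (the default): 1 for the treasure transition,
# 1 / size**2 whenever LEFT is taken, and 0 otherwise.
mdp = DeepSeaEpisodic(seed=42, size=8)

# Stochastic rewards: either let the class derive Beta distributions from
# optimal_return, suboptimal_return and reward_variance_multiplier ...
mdp_stochastic = DeepSeaEpisodic(seed=42, size=8, make_reward_stochastic=True)

# ... or pass all three distributions explicitly, e.g. as ("beta", (a, b))
# tuples, which the constructor resolves through get_dist. The explicit
# distributions are only used when all three are provided.
mdp_custom = DeepSeaEpisodic(
    seed=42,
    size=8,
    make_reward_stochastic=True,
    optimal_distribution=("beta", (7.0, 1.0)),
    sub_optimal_distribution=("beta", (1.0, 15.0)),
    other_distribution=("beta", (1.0, 150.0)),
)
```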
@staticmethod
def get_unique_symbols() -> List[str]:
Returns
- List[str]: the unique symbols of the grid representation of the MDP.
@staticmethod
def does_seed_change_MDP_structure() -> bool:
Returns
- bool: True if changing the seed changes the transition matrix and/or the rewards matrix. This may happen, for example, when there are fewer starting states than possible ones and the effective starting states are picked at random based on the seed.
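For the DeepSea family this always returns False: changing the seed changes the reward samples (and the random transitions, if p_rand is set), but not the layout of the grid. A sketch of what that implies, reusing the assumed DeepSeaEpisodic subclass from above and the inherited T (transition probabilities) property:

```python
import numpy as np

mdp_a = DeepSeaEpisodic(seed=0, size=6)
mdp_b = DeepSeaEpisodic(seed=1, size=6)

assert DeepSeaEpisodic.does_seed_change_MDP_structure() is False

# With identical structural parameters, the transition kernels coincide
# regardless of the seed.
assert np.allclose(mdp_a.T, mdp_b.T)
```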
@staticmethod
def sample_mdp_parameters(n: int, is_episodic: bool, seed: int = None) -> List[Dict[str, Any]]:
Returns
- List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
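Being a static method, it can be called on the abstract class directly; each returned dictionary is a set of keyword arguments for a concrete DeepSea constructor. A short sketch (DeepSeaEpisodic is again an assumed subclass name):

```python
from colosseum.mdp.deep_sea.base import DeepSeaMDP

params = DeepSeaMDP.sample_mdp_parameters(n=3, is_episodic=True, seed=0)

# Each sample contains size, p_rand, make_reward_stochastic,
# reward_variance_multiplier, and the three reward distributions.
print(params[0])

# The dictionaries can be unpacked into a concrete DeepSea constructor.
mdp = DeepSeaEpisodic(seed=0, **params[0])
```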
def get_gin_parameters(self, index: int) -> str:
Returns
- str: The gin config of the MDP instance.
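A brief sketch of retrieving the gin configuration for an instance, reusing the mdp object from the sketches above; the exact formatting of the returned string is delegated to the inherited produce_gin_file_from_mdp_parameters:

```python
# index distinguishes multiple parameter configurations of the same
# MDP class within one gin file.
gin_config = mdp.get_gin_parameters(index=0)
print(gin_config)
```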
Inherited Members
- colosseum.mdp.base.BaseMDP
- get_available_hardness_measures
- produce_gin_file_from_mdp_parameters
- is_episodic
- sample_parameters
- get_grid_representation
- get_gin_config
- get_node_labels
- get_node_action_labels
- hash
- instantiate_MDP
- T
- R
- recurrent_nodes_set
- communication_class
- get_optimal_policy
- get_worst_policy
- get_value_functions
- optimal_value_functions
- worst_value_functions
- random_value_functions
- optimal_transition_probabilities
- worst_transition_probabilities
- random_transition_probabilities
- optimal_markov_chain
- worst_markov_chain
- random_markov_chain
- get_stationary_distribution
- optimal_stationary_distribution
- worst_stationary_distribution
- random_stationary_distribution
- optimal_average_rewards
- worst_average_rewards
- random_average_rewards
- get_average_reward
- optimal_average_reward
- worst_average_reward
- random_average_reward
- transition_matrix_and_rewards
- graph_layout
- graph_metrics
- diameter
- sum_reciprocals_suboptimality_gaps
- discounted_value_norm
- undiscounted_value_norm
- value_norm
- measures_of_hardness
- summary
- hardness_report
- get_info_class
- get_transition_distributions
- get_reward_distribution
- sample_reward
- get_measure_from_name
- action_spec
- observation_spec
- get_observation
- reset
- step
- random_steps
- random_step
- get_visitation_counts
- reset_visitation_counts
- get_value_node_labels
- dm_env._environment.Environment
- reward_spec
- discount_spec
- close