colosseum.mdp.river_swim.base
```python
import abc
from dataclasses import dataclass
from enum import IntEnum
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union

import numpy as np
from scipy.stats import beta, rv_continuous

from colosseum.mdp import BaseMDP
from colosseum.mdp.utils.custom_samplers import NextStateSampler
from colosseum.utils.miscellanea import (
    check_distributions,
    deterministic,
    get_dist,
    rounding_nested_structure,
)

if TYPE_CHECKING:
    from colosseum.mdp import ACTION_TYPE, NODE_TYPE


@dataclass(frozen=True)
class RiverSwimNode:
    """
    The node for the RiverSwim MDP.
    """

    X: int
    """x coordinate."""

    def __str__(self):
        return f"X={self.X}"

    def __iter__(self):
        return iter((self.X, self.X))


class RiverSwimAction(IntEnum):
    """
    The actions available in the RiverSwim MDP.
    """

    LEFT = 0
    RIGHT = 1


class RiverSwimMDP(BaseMDP, abc.ABC):
    """
    The base class for the RiverSwim family.
    """

    @staticmethod
    def get_action_class() -> RiverSwimAction:
        return RiverSwimAction

    @staticmethod
    def get_unique_symbols() -> List[str]:
        return [" ", "A", "S", "G"]

    @staticmethod
    def does_seed_change_MDP_structure() -> bool:
        return False

    @staticmethod
    def sample_mdp_parameters(
        n: int, is_episodic: bool, seed: int = None
    ) -> List[Dict[str, Any]]:
        rng = np.random.RandomState(np.random.randint(10_000) if seed is None else seed)
        samples = []
        for _ in range(n):
            p_rand, p_lazy, _ = 0.9 * rng.dirichlet([0.2, 0.2, 5])
            sample = dict(
                size=int(np.minimum(2.5 + (200 / (45 * rng.random() + 11)), 25))
                if is_episodic
                else int((6 * rng.random() + 2) ** 2.2),
                make_reward_stochastic=rng.choice([True, False]),
                p_rand=p_rand,
                p_lazy=p_lazy,
                reward_variance_multiplier=2 * rng.random() + 0.005,
            )
            sample["p_rand"] = None if sample["p_rand"] < 0.01 else sample["p_rand"]
            sample["p_lazy"] = None if sample["p_lazy"] < 0.01 else sample["p_lazy"]

            if sample["make_reward_stochastic"]:
                sample["sub_optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"] * (1 / 0.2 - 1),
                    ),
                )
                sample["optimal_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"] * (1 / 0.9 - 1),
                    ),
                )
                sample["other_distribution"] = (
                    "beta",
                    (
                        sample["reward_variance_multiplier"],
                        sample["reward_variance_multiplier"] * (10 / 0.2 - 1),
                    ),
                )
            else:
                sample["sub_optimal_distribution"] = (
                    "deterministic",
                    (round(5 / 1000, 3),),
                )
                sample["optimal_distribution"] = ("deterministic", (1.0,))
                sample["other_distribution"] = ("deterministic", (0.0,))

            samples.append(rounding_nested_structure(sample))
        return samples

    @staticmethod
    def get_node_class() -> Type[RiverSwimNode]:
        return RiverSwimNode

    def get_gin_parameters(self, index: int) -> str:
        prms = dict(
            size=self._size,
            make_reward_stochastic=self._make_reward_stochastic,
            reward_variance_multiplier=self._reward_variance_multiplier,
            optimal_distribution=(
                self._optimal_distribution.dist.name,
                self._optimal_distribution.args,
            ),
            other_distribution=(
                self._other_distribution.dist.name,
                self._other_distribution.args,
            ),
            sub_optimal_distribution=(
                self._sub_optimal_distribution.dist.name,
                self._sub_optimal_distribution.args,
            ),
        )

        if self._p_rand is not None:
            prms["p_rand"] = self._p_rand
        if self._p_lazy is not None:
            prms["p_lazy"] = self._p_lazy

        return RiverSwimMDP.produce_gin_file_from_mdp_parameters(
            prms, type(self).__name__, index
        )

    @property
    def n_actions(self) -> int:
        return len(RiverSwimAction)

    def __init__(
        self,
        seed: int,
        size: int,
        optimal_mean_reward: float = 0.9,
        sub_optimal_mean_reward: float = 0.2,
        sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
        optimal_distribution: Union[Tuple, rv_continuous] = None,
        other_distribution: Union[Tuple, rv_continuous] = None,
        make_reward_stochastic=False,
        reward_variance_multiplier: float = 1.0,
        **kwargs,
    ):
        """
        Parameters
        ----------
        seed : int
            The seed used for sampling rewards and next states.
        size : int
            The length of the chain.
        optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward
            for the highly rewarding states. By default, it is set to 0.9.
        sub_optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward
            for the suboptimal states. By default, it is set to 0.2.
        sub_optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the suboptimal rewarding states. It can be either passed
            as a tuple containing Beta parameters or as a rv_continuous object.
        optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the highly rewarding state. It can be either passed as a
            tuple containing Beta parameters or as a rv_continuous object.
        other_distribution : Union[Tuple, rv_continuous]
            The distribution of the other states. It can be either passed as a tuple
            containing Beta parameters or as a rv_continuous object.
        make_reward_stochastic : bool
            If True, the rewards of the MDP will be stochastic. By default, it is set to False.
        reward_variance_multiplier : float
            A constant that can be used to increase the variance of the reward distributions
            without changing their means. The lower the value, the higher the variance.
            By default, it is set to 1.
        """

        if type(sub_optimal_distribution) == tuple:
            sub_optimal_distribution = get_dist(
                sub_optimal_distribution[0], sub_optimal_distribution[1]
            )
        if type(optimal_distribution) == tuple:
            optimal_distribution = get_dist(
                optimal_distribution[0], optimal_distribution[1]
            )
        if type(other_distribution) == tuple:
            other_distribution = get_dist(other_distribution[0], other_distribution[1])

        self._size = size
        self._optimal_mean_reward = optimal_mean_reward
        self._sub_optimal_mean_reward = sub_optimal_mean_reward
        self._optimal_distribution = optimal_distribution
        self._sub_optimal_distribution = sub_optimal_distribution
        self._other_distribution = other_distribution

        dists = [
            sub_optimal_distribution,
            optimal_distribution,
            other_distribution,
        ]
        if dists.count(None) == 0:
            self._sub_optimal_distribution = sub_optimal_distribution
            self._optimal_distribution = optimal_distribution
            self._other_distribution = other_distribution
        else:
            if make_reward_stochastic:
                if self.is_episodic():
                    sub_optimal_mean_reward /= self._size
                self._sub_optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / sub_optimal_mean_reward - 1),
                )
                self._optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / optimal_mean_reward - 1),
                )
                self._other_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (10 / sub_optimal_mean_reward - 1),
                )
            else:
                self._sub_optimal_distribution = deterministic(5 / 1000)
                self._optimal_distribution = deterministic(1.0)
                self._other_distribution = deterministic(0.0)

        super(RiverSwimMDP, self).__init__(
            seed=seed,
            reward_variance_multiplier=reward_variance_multiplier,
            make_reward_stochastic=make_reward_stochastic,
            **kwargs,
        )

    def _get_next_nodes_parameters(
        self, node: "NODE_TYPE", action: "ACTION_TYPE"
    ) -> Tuple[Tuple[dict, float], ...]:
        return (
            (
                dict(
                    X=min(node.X + 1, self._size - 1)
                    if action == RiverSwimAction.RIGHT
                    else max(node.X - 1, 0),
                ),
                1.0,
            ),
        )

    def _get_reward_distribution(
        self, node: "NODE_TYPE", action: "ACTION_TYPE", next_node: "NODE_TYPE"
    ) -> rv_continuous:
        return (
            self._optimal_distribution
            if node.X == self._size - 1 and action == RiverSwimAction.RIGHT
            else (
                self._sub_optimal_distribution
                if node.X == 0 and action == RiverSwimAction.LEFT
                else self._other_distribution
            )
        )

    def _get_starting_node_sampler(self) -> NextStateSampler:
        return NextStateSampler(next_nodes=self._possible_starting_nodes)

    def _check_parameters_in_input(self):
        super(RiverSwimMDP, self)._check_parameters_in_input()

        assert self._size > 1
        assert self._optimal_mean_reward - 0.1 > self._sub_optimal_mean_reward

        dists = [
            self._sub_optimal_distribution,
            self._optimal_distribution,
            self._other_distribution,
        ]
        check_distributions(
            dists,
            self._make_reward_stochastic,
        )

    def _get_grid_representation(self, node: "NODE_TYPE") -> np.ndarray:
        grid = np.zeros((1, self._size), dtype=str)
        grid[:, :] = " "
        grid[0, 0] = "S"
        grid[0, -1] = "G"
        grid[0, node.X] = "A"
        return grid

    @property
    def _possible_starting_nodes(self) -> List["NODE_TYPE"]:
        return [RiverSwimNode(0)]

    @property
    def parameters(self) -> Dict[str, Any]:
        return {
            **super(RiverSwimMDP, self).parameters,
            **dict(
                size=self._size,
                optimal_mean_reward=self._optimal_mean_reward,
                sub_optimal_mean_reward=self._sub_optimal_mean_reward,
                optimal_distribution=self._optimal_distribution,
                sub_optimal_distribution=self._sub_optimal_distribution,
                other_distribution=self._other_distribution,
            ),
        }
```
```python
@dataclass(frozen=True)
class RiverSwimNode:
```
The node for the RiverSwim MDP.
```python
class RiverSwimAction(enum.IntEnum):
```
The actions available in the RiverSwim MDP.
Inherited Members
- enum.Enum
- name
- value
- builtins.int
- conjugate
- bit_length
- to_bytes
- from_bytes
- as_integer_ratio
- real
- imag
- numerator
- denominator
```python
class RiverSwimMDP(colosseum.mdp.base.BaseMDP, abc.ABC):
```
The base class for the RiverSwim family.
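As the source above shows, the base transition dynamics are deterministic: `RIGHT` moves the agent one state toward the end of the chain and `LEFT` moves it one state back, with clipping at the two boundaries, while the optimal reward distribution is attached only to taking `RIGHT` in the last state and the suboptimal one only to taking `LEFT` in the first state. The following is a minimal standalone sketch of that transition and reward placement; it mirrors `_get_next_nodes_parameters` and `_get_reward_distribution` with the deterministic default rewards from `__init__`, and it does not use the colosseum API (any `p_rand`/`p_lazy` perturbations applied by the base class are ignored here).

```python
# Illustrative sketch of the RiverSwim chain logic, mirrored from
# _get_next_nodes_parameters and _get_reward_distribution above.
LEFT, RIGHT = 0, 1

def next_state(x: int, action: int, size: int) -> int:
    # RIGHT moves one step toward the end of the chain, LEFT one step back,
    # clipped to the interval [0, size - 1].
    return min(x + 1, size - 1) if action == RIGHT else max(x - 1, 0)

def mean_reward(x: int, action: int, size: int) -> float:
    # Deterministic default rewards from __init__: 1.0 for RIGHT in the last
    # state, 5/1000 for LEFT in the first state, 0.0 everywhere else.
    if x == size - 1 and action == RIGHT:
        return 1.0
    if x == 0 and action == LEFT:
        return 5 / 1000
    return 0.0

size, x = 5, 0
for action in [RIGHT, RIGHT, RIGHT, RIGHT, RIGHT, LEFT]:
    name = "RIGHT" if action == RIGHT else "LEFT"
    print(f"X={x}, action={name}, reward mean={mean_reward(x, action, size)}")
    x = next_state(x, action, size)
```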
```python
RiverSwimMDP(
    seed: int,
    size: int,
    optimal_mean_reward: float = 0.9,
    sub_optimal_mean_reward: float = 0.2,
    sub_optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    other_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
)
```
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The length of the chain.
- optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the highly rewarding states. By default, it is set to 0.9.
- sub_optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the suboptimal states. By default, it is set to 0.2.
- sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be passed either as a (distribution name, parameters) tuple, e.g. ("beta", (a, b)), or as an rv_continuous object.
- optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be passed either as a (distribution name, parameters) tuple, e.g. ("beta", (a, b)), or as an rv_continuous object.
- other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be passed either as a (distribution name, parameters) tuple, e.g. ("beta", (a, b)), or as an rv_continuous object.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1. See the sketch below for how it enters the Beta parameterization.
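When `make_reward_stochastic` is True and no distributions are passed explicitly, the constructor builds Beta distributions of the form `beta(c, c * (1 / mean - 1))` with `c = reward_variance_multiplier`, so the mean stays fixed at `mean` while `c` only controls the spread. The snippet below is a small sketch of that relationship using scipy directly, with the default means 0.9 and 0.2; it mirrors the parameterization in `__init__` rather than calling the colosseum API.

```python
from scipy.stats import beta

def reward_dist(mean_reward: float, c: float):
    # Same parameterization as __init__ above:
    # Beta(c, c * (1 / mean - 1)) has mean c / (c + c * (1 / mean - 1)) = mean.
    return beta(c, c * (1 / mean_reward - 1))

for c in (0.5, 1.0, 5.0):
    optimal = reward_dist(0.9, c)      # default optimal_mean_reward
    sub_optimal = reward_dist(0.2, c)  # default sub_optimal_mean_reward
    print(
        f"c={c}: optimal mean={optimal.mean():.3f}, var={optimal.var():.4f}; "
        f"sub-optimal mean={sub_optimal.mean():.3f}, var={sub_optimal.var():.4f}"
    )
```

The printed means stay at 0.9 and 0.2 while the variances shrink as `c` grows, which is exactly the behaviour described for `reward_variance_multiplier`. Note that in the episodic case the constructor additionally divides `sub_optimal_mean_reward` by `size`, and the "other" states use `beta(c, c * (10 / sub_optimal_mean_reward - 1))`, whose mean works out to `sub_optimal_mean_reward / 10`.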
```python
@staticmethod
def get_unique_symbols() -> List[str]:
```
Returns
- List[str]: The unique symbols used in the grid representation of the MDP; see the sketch below.
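In the grid representation built by `_get_grid_representation` in the source above, "S" marks the leftmost (starting) state, "G" the rightmost state, "A" the agent's current position, and " " the remaining cells. A short sketch reproducing that layout outside of colosseum:

```python
import numpy as np

def grid_representation(size: int, agent_x: int) -> np.ndarray:
    # Mirrors _get_grid_representation: a 1 x size grid of single characters.
    grid = np.zeros((1, size), dtype=str)
    grid[:, :] = " "
    grid[0, 0] = "S"
    grid[0, -1] = "G"
    grid[0, agent_x] = "A"
    return grid

print("".join(grid_representation(size=6, agent_x=2)[0]))  # prints "S A  G"
```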
```python
@staticmethod
def does_seed_change_MDP_structure() -> bool:
```
Returns
- bool: True if changing the seed changes the transition matrix and/or the rewards matrix. This may happen, for example, when there are fewer starting states than possible ones and the effective starting states are picked at random based on the seed.
```python
@staticmethod
def sample_mdp_parameters(n: int, is_episodic: bool, seed: int = None) -> List[Dict[str, Any]]:
```
Returns
- List[Dict[str, Any]]: The n sampled parameter dictionaries, each of which can be used to construct an MDP instance in a reasonable amount of time; see the usage sketch below.
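Because `sample_mdp_parameters` is a static method, it can be called directly on `RiverSwimMDP` without instantiating a concrete subclass. A minimal usage sketch, assuming colosseum is installed:

```python
from colosseum.mdp.river_swim.base import RiverSwimMDP

# Sample two parameter dictionaries for episodic RiverSwim instances.
for params in RiverSwimMDP.sample_mdp_parameters(n=2, is_episodic=True, seed=42):
    print(
        params["size"],
        params["make_reward_stochastic"],
        params["p_rand"],
        params["optimal_distribution"],
    )
```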
```python
def get_gin_parameters(self, index: int) -> str:
```
Returns
- str: The gin config of the MDP instance; see the hedged sketch below.
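`get_gin_parameters` needs a concrete instance, since it reads the instance's size and reward distributions. The sketch below is only illustrative: the subclass name `RiverSwimEpisodic` and its import path are assumptions about the concrete implementations shipped with colosseum, not something defined in this module.

```python
# Assumption: colosseum provides an episodic concrete subclass named
# RiverSwimEpisodic; adjust the import to the concrete class you actually use.
from colosseum.mdp.river_swim import RiverSwimEpisodic

mdp = RiverSwimEpisodic(seed=0, size=8, make_reward_stochastic=True)
print(mdp.get_gin_parameters(index=0))  # gin-style config string for instance 0
```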
Inherited Members
- colosseum.mdp.base.BaseMDP
- get_available_hardness_measures
- produce_gin_file_from_mdp_parameters
- is_episodic
- sample_parameters
- get_grid_representation
- get_gin_config
- get_node_labels
- get_node_action_labels
- hash
- instantiate_MDP
- T
- R
- recurrent_nodes_set
- communication_class
- get_optimal_policy
- get_worst_policy
- get_value_functions
- optimal_value_functions
- worst_value_functions
- random_value_functions
- optimal_transition_probabilities
- worst_transition_probabilities
- random_transition_probabilities
- optimal_markov_chain
- worst_markov_chain
- random_markov_chain
- get_stationary_distribution
- optimal_stationary_distribution
- worst_stationary_distribution
- random_stationary_distribution
- optimal_average_rewards
- worst_average_rewards
- random_average_rewards
- get_average_reward
- optimal_average_reward
- worst_average_reward
- random_average_reward
- transition_matrix_and_rewards
- graph_layout
- graph_metrics
- diameter
- sum_reciprocals_suboptimality_gaps
- discounted_value_norm
- undiscounted_value_norm
- value_norm
- measures_of_hardness
- summary
- hardness_report
- get_info_class
- get_transition_distributions
- get_reward_distribution
- sample_reward
- get_measure_from_name
- action_spec
- observation_spec
- get_observation
- reset
- step
- random_steps
- random_step
- get_visitation_counts
- reset_visitation_counts
- get_value_node_labels
- dm_env._environment.Environment
- reward_spec
- discount_spec
- close