colosseum.mdp.frozen_lake.infinite_horizon
```python
from typing import Any, Dict, List, Tuple

import gin

from colosseum.mdp import ContinuousMDP
from colosseum.mdp.frozen_lake.base import FrozenLakeMDP, FrozenLakeNode


@gin.configurable
class FrozenLakeContinuous(ContinuousMDP, FrozenLakeMDP):
    """
    The FrozenLake continuous MDP.
    """

    @staticmethod
    def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
        return FrozenLakeMDP.sample_mdp_parameters(n, False, seed)

    def custom_graph_layout(self) -> Dict[FrozenLakeNode, Tuple[int, int]]:
        """
        Returns
        -------
        Dict[FrozenLakeNode, Tuple[int, int]]
            The custom layout to draw a nx.Graph.
        """
        return {node: tuple(node) for node in self.G}
```
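Because the class is decorated with @gin.configurable, its constructor arguments can also be supplied through gin bindings instead of being passed explicitly. A minimal sketch (the bound values below are illustrative choices, not defaults shipped with the package):

```python
# Sketch: configuring FrozenLakeContinuous through gin bindings.
# The bound values are illustrative only.
import gin

from colosseum.mdp.frozen_lake.infinite_horizon import FrozenLakeContinuous

gin.parse_config(
    """
    FrozenLakeContinuous.size = 8
    FrozenLakeContinuous.p_frozen = 0.8
    FrozenLakeContinuous.make_reward_stochastic = True
    """
)

# Arguments bound above act as defaults, so only the seed is passed here.
mdp = FrozenLakeContinuous(seed=42)
```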
```python
@gin.configurable
class FrozenLakeContinuous(ContinuousMDP, FrozenLakeMDP):
```
The FrozenLake continuous MDP.
```python
FrozenLakeContinuous(
    seed: int,
    size: int,
    p_frozen: float,
    optimal_return: float = 1.0,
    suboptimal_return: float = 0.1,
    is_slippery: bool = True,
    goal_r: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    default_r: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
)
```
```python
def __init__(
    self,
    seed: int,
    size: int,
    p_frozen: float,
    optimal_return: float = 1.0,
    suboptimal_return: float = 0.1,
    is_slippery: bool = True,
    goal_r: Union[Tuple, rv_continuous] = None,
    default_r: Union[Tuple, rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
):
    """

    Parameters
    ----------
    seed : int
        The seed used for sampling rewards and next states.
    size : int
        The size of the grid.
    p_frozen : float
        The probability that a tile of the lake is frozen and does not contain a hole.
    optimal_return: float
        If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
        By default, it is set to 1.
    suboptimal_return: float
        If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
        By default, it is set to 0.1.
    is_slippery : bool
        If True, the outcome of the action is stochastic due to the frozen tiles being slippery. By default, it is
        set to True.
    goal_r : Union[Tuple, rv_continuous]
        The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
        or as a rv_continuous object.
    default_r : Union[Tuple, rv_continuous]
        The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
        rv_continuous object.
    make_reward_stochastic : bool
        If True, the rewards of the MDP will be stochastic. By default, it is set to False.
    reward_variance_multiplier : float
        A constant that can be used to increase the variance of the reward distributions without changing their means.
        The lower the value, the higher the variance. By default, it is set to 1.
    """

    if type(goal_r) == tuple:
        goal_r = get_dist(goal_r[0], goal_r[1])
    if type(default_r) == tuple:
        default_r = get_dist(default_r[0], default_r[1])

    self._size = size
    self._p_frozen = p_frozen
    self._optimal_return = optimal_return
    self._suboptimal_return = suboptimal_return
    self._is_slippery = is_slippery
    self._goal_r = goal_r
    self._default_r = default_r

    np.random.seed(seed)
    self.lake = np.array(
        list(
            map(
                lambda x: list(x),
                generate_random_map(size=self._size, p=self._p_frozen),
            )
        )
    )

    if (default_r, goal_r).count(None) == 0:
        self._default_r = default_r
        self._goal_r = goal_r
    else:
        if make_reward_stochastic:
            self._default_r = beta(
                reward_variance_multiplier,
                reward_variance_multiplier
                * (size ** 2 / self._suboptimal_return - 1),
            )
            self._goal_r = beta(
                reward_variance_multiplier * (size ** 2 / self._optimal_return - 1),
                reward_variance_multiplier,
            )
        else:
            self._default_r = deterministic(0.0)
            self._goal_r = deterministic(1.0)

    super(FrozenLakeMDP, self).__init__(
        seed=seed,
        reward_variance_multiplier=reward_variance_multiplier,
        make_reward_stochastic=make_reward_stochastic,
        **kwargs,
    )
```
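In the stochastic-reward branch above, both Beta shape parameters are scaled by reward_variance_multiplier, so the mean of default_r stays at suboptimal_return / size ** 2 regardless of the multiplier, while a smaller multiplier yields a larger variance. A quick numerical check of that claim (illustrative only, using scipy):

```python
# Illustrative check of the Beta parameterisation used in __init__ above:
# scaling both shape parameters by the same constant keeps the mean fixed
# and only changes the variance.
from scipy.stats import beta

size, suboptimal_return = 4, 0.1
for c in (0.5, 1.0, 4.0):  # candidate reward_variance_multiplier values
    dist = beta(c, c * (size ** 2 / suboptimal_return - 1))
    # mean == suboptimal_return / size ** 2 == 0.00625 for every c;
    # the variance shrinks as c grows.
    print(c, dist.mean(), dist.var())
```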
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The size of the grid.
- p_frozen (float): The probability that a tile of the lake is frozen and does not contain a hole.
- optimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 1.
- suboptimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.1.
- is_slippery (bool): If True, the outcome of the action is stochastic due to the frozen tiles being slippery. By default, it is set to True.
- goal_r (Union[Tuple, rv_continuous]): The reward distribution of the highly rewarding state. It can be passed either as a tuple containing the Beta parameters or as an rv_continuous object.
- default_r (Union[Tuple, rv_continuous]): The reward distribution of the other states. It can be passed either as a tuple containing the Beta parameters or as an rv_continuous object.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
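As a usage sketch (the argument values below are arbitrary examples rather than recommended settings), the MDP can be constructed directly from these parameters:

```python
# Illustrative construction of the continuous FrozenLake MDP.
from colosseum.mdp.frozen_lake.infinite_horizon import FrozenLakeContinuous

mdp = FrozenLakeContinuous(
    seed=0,
    size=5,                       # 5x5 lake
    p_frozen=0.9,                 # 90% of tiles are frozen (no hole)
    is_slippery=True,             # stochastic transitions
    make_reward_stochastic=True,  # Beta-distributed rewards
)
```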
```python
@staticmethod
def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
    return FrozenLakeMDP.sample_mdp_parameters(n, False, seed)
```
Returns
- List[Dict[str, Any]]: The n sampled parameter dictionaries, each of which can be used to construct an MDP in a reasonable amount of time.
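For example (a sketch that assumes each returned dictionary contains every required constructor argument, including the seed), the sampled parameters can be used to build a small family of MDPs:

```python
# Sketch: sample three parameter dictionaries and instantiate an MDP from each.
# Assumes the dictionaries map directly onto the constructor's keyword arguments.
from colosseum.mdp.frozen_lake.infinite_horizon import FrozenLakeContinuous

for params in FrozenLakeContinuous.sample_parameters(n=3, seed=0):
    mdp = FrozenLakeContinuous(**params)
    print(params)
```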
```python
def custom_graph_layout(self) -> Dict[FrozenLakeNode, Tuple[int, int]]:
    """
    Returns
    -------
    Dict[FrozenLakeNode, Tuple[int, int]]
        The custom layout to draw a nx.Graph.
    """
    return {node: tuple(node) for node in self.G}
```
Returns
- Dict[FrozenLakeNode, Tuple[int, int]]: The custom layout used to draw an nx.Graph.
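Since the layout maps every FrozenLakeNode to its grid coordinates, it can be passed to networkx as the pos argument so that the drawn graph mirrors the lake grid. A sketch, assuming mdp.G is the underlying networkx graph referenced in the source above:

```python
# Sketch: draw the MDP graph with nodes placed at their (x, y) lake coordinates.
import matplotlib.pyplot as plt
import networkx as nx

from colosseum.mdp.frozen_lake.infinite_horizon import FrozenLakeContinuous

mdp = FrozenLakeContinuous(seed=0, size=4, p_frozen=0.9)
layout = mdp.custom_graph_layout()        # {FrozenLakeNode: (x, y)}
nx.draw(mdp.G, pos=layout, node_size=80)  # mdp.G: the graph used in the source above
plt.show()
```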
Inherited Members
- colosseum.mdp.frozen_lake.base.FrozenLakeMDP
  - get_unique_symbols
  - does_seed_change_MDP_structure
  - sample_mdp_parameters
  - get_node_class
  - get_gin_parameters
  - n_actions
  - parameters
- colosseum.mdp.base.BaseMDP
  - get_available_hardness_measures
  - produce_gin_file_from_mdp_parameters
  - get_gin_config
  - get_node_labels
  - get_node_action_labels
  - hash
  - instantiate_MDP
  - T
  - R
  - recurrent_nodes_set
  - communication_class
  - get_optimal_policy
  - get_worst_policy
  - get_value_functions
  - optimal_value_functions
  - worst_value_functions
  - random_value_functions
  - optimal_transition_probabilities
  - worst_transition_probabilities
  - random_transition_probabilities
  - optimal_markov_chain
  - worst_markov_chain
  - random_markov_chain
  - get_stationary_distribution
  - optimal_stationary_distribution
  - worst_stationary_distribution
  - random_stationary_distribution
  - optimal_average_rewards
  - worst_average_rewards
  - random_average_rewards
  - get_average_reward
  - optimal_average_reward
  - worst_average_reward
  - random_average_reward
  - transition_matrix_and_rewards
  - graph_layout
  - graph_metrics
  - diameter
  - sum_reciprocals_suboptimality_gaps
  - discounted_value_norm
  - undiscounted_value_norm
  - value_norm
  - measures_of_hardness
  - summary
  - hardness_report
  - get_info_class
  - get_transition_distributions
  - get_reward_distribution
  - sample_reward
  - get_measure_from_name
  - action_spec
  - observation_spec
  - get_observation
  - reset
  - step
  - random_steps
  - random_step
  - get_visitation_counts
  - reset_visitation_counts
  - get_value_node_labels
- dm_env._environment.Environment
  - reward_spec
  - discount_spec
  - close
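Since the class ultimately implements the dm_env.Environment interface listed above (reset, step, action_spec, ...), it can be driven with a standard interaction loop. A minimal sketch, assuming n_actions (inherited from FrozenLakeMDP) gives the number of discrete actions:

```python
# Minimal interaction sketch against the dm_env-style interface.
import numpy as np

from colosseum.mdp.frozen_lake.infinite_horizon import FrozenLakeContinuous

mdp = FrozenLakeContinuous(seed=0, size=4, p_frozen=0.9)
rng = np.random.default_rng(0)

ts = mdp.reset()                           # dm_env.TimeStep
for _ in range(20):
    action = rng.integers(mdp.n_actions)   # uniformly random policy
    ts = mdp.step(action)
    print(ts.reward, ts.observation)
```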