colosseum.mdp.simple_grid.infinite_horizon
from typing import Any, Dict, List, Tuple, TYPE_CHECKING

import gin

from colosseum.mdp import ContinuousMDP
from colosseum.mdp.simple_grid.base import SimpleGridMDP

if TYPE_CHECKING:
    from colosseum.mdp import NODE_TYPE


@gin.configurable
class SimpleGridContinuous(ContinuousMDP, SimpleGridMDP):
    @staticmethod
    def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
        return SimpleGridMDP.sample_mdp_parameters(n, False, seed)

    def custom_graph_layout(self) -> Dict["NODE_TYPE", Tuple[float, float]]:
        return {node: list(node) for node in self.G}
@gin.configurable
class SimpleGridContinuous(ContinuousMDP, SimpleGridMDP):
The continuous (infinite-horizon) version of the simple grid MDP.
SimpleGridContinuous(
    seed: int,
    size: int,
    reward_type: colosseum.mdp.simple_grid.base.SimpleGridReward = <SimpleGridReward.XOR: 3>,
    n_starting_states: int = 1,
    optimal_mean_reward: float = 0.9,
    sub_optimal_mean_reward: float = 0.2,
    optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    sub_optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    other_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs
)
def __init__(
    self,
    seed: int,
    size: int,
    reward_type: SimpleGridReward = SimpleGridReward.XOR,
    n_starting_states: int = 1,
    optimal_mean_reward: float = 0.9,
    sub_optimal_mean_reward: float = 0.2,
    optimal_distribution: Union[Tuple, rv_continuous] = None,
    sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
    other_distribution: Union[Tuple, rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
):
    """
    Parameters
    ----------
    seed : int
        The seed used for sampling rewards and next states.
    size : int
        The size of the grid.
    reward_type : SimpleGridReward
        The type of reward for the MDP. By default, the XOR type is used.
    n_starting_states : int
        The number of possible starting states.
    optimal_mean_reward : float
        If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
        By default, it is set to 0.9.
    sub_optimal_mean_reward : float
        If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
        By default, it is set to 0.2.
    optimal_distribution : Union[Tuple, rv_continuous]
        The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta
        parameters or as a rv_continuous object.
    sub_optimal_distribution : Union[Tuple, rv_continuous]
        The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
        parameters or as a rv_continuous object.
    other_distribution : Union[Tuple, rv_continuous]
        The distribution of the other states. It can be either passed as a tuple containing Beta parameters or
        as a rv_continuous object.
    make_reward_stochastic : bool
        If True, the rewards of the MDP will be stochastic. By default, it is set to False.
    reward_variance_multiplier : float
        A constant that can be used to increase the variance of the reward distributions without changing their
        means. The lower the value, the higher the variance. By default, it is set to 1.
    """

    if type(sub_optimal_distribution) == tuple:
        sub_optimal_distribution = get_dist(
            sub_optimal_distribution[0], sub_optimal_distribution[1]
        )
    if type(optimal_distribution) == tuple:
        optimal_distribution = get_dist(
            optimal_distribution[0], optimal_distribution[1]
        )
    if type(other_distribution) == tuple:
        other_distribution = get_dist(other_distribution[0], other_distribution[1])

    self._size = size
    self._reward_type = SimpleGridReward(reward_type)
    self._n_starting_states = n_starting_states
    self._optimal_mean_reward = optimal_mean_reward
    self._sub_optimal_mean_reward = sub_optimal_mean_reward
    dists = [
        sub_optimal_distribution,
        optimal_distribution,
        other_distribution,
    ]

    if dists.count(None) == 0:
        self._sub_optimal_distribution = sub_optimal_distribution
        self._optimal_distribution = optimal_distribution
        self._other_distribution = other_distribution
    else:
        if make_reward_stochastic:
            self._sub_optimal_distribution = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (10 / sub_optimal_mean_reward - 1),
            )
            self._optimal_distribution = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (1 / optimal_mean_reward - 1),
            )
            self._other_distribution = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (1 / sub_optimal_mean_reward - 1),
            )
        else:
            self._sub_optimal_distribution = deterministic(0.0)
            self._optimal_distribution = deterministic(1.0)
            self._other_distribution = deterministic(0.5)

    super(SimpleGridMDP, self).__init__(
        seed=seed,
        reward_variance_multiplier=reward_variance_multiplier,
        make_reward_stochastic=make_reward_stochastic,
        **kwargs,
    )
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The size of the grid.
- reward_type (SimpleGridReward): The type of reward for the MDP. By default, the XOR type is used.
- n_starting_states (int): The number of possible starting states.
- optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 0.9.
- sub_optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.2.
- optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
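As a usage sketch (not taken from the library's own examples; the parameter values below are illustrative, only `seed` and `size` are required), the MDP can be instantiated directly from this module:

```python
from colosseum.mdp.simple_grid.infinite_horizon import SimpleGridContinuous

# Illustrative values; only `seed` and `size` have no default.
mdp = SimpleGridContinuous(
    seed=42,
    size=5,
    n_starting_states=2,
    make_reward_stochastic=True,  # rewards are drawn from Beta distributions
)

# The reward distributions can also be supplied explicitly, either as
# (alpha, beta) tuples of Beta parameters or as scipy.stats rv_continuous
# objects. Note that, per the constructor source above, the explicit
# distributions are only used when all three are provided.
```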
@staticmethod
def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
    return SimpleGridMDP.sample_mdp_parameters(n, False, seed)
Returns
- List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
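For instance (a sketch, assuming the returned dictionaries map directly onto the constructor's keyword arguments, as the description above suggests):

```python
from colosseum.mdp.simple_grid.infinite_horizon import SimpleGridContinuous

# Sample three parameter configurations and build an MDP from each.
for params in SimpleGridContinuous.sample_parameters(n=3, seed=0):
    mdp = SimpleGridContinuous(**params)
```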
def custom_graph_layout(self) -> Dict["NODE_TYPE", Tuple[float, float]]:

Returns a custom layout for the MDP graph, mapping each node to its coordinates in the grid.
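A sketch of how the layout might be used, assuming `mdp` is the instance constructed above and that `mdp.G` is the networkx graph referenced in the module source:

```python
import networkx as nx

# Each node is mapped to its grid coordinates, so the drawing mirrors the grid.
layout = mdp.custom_graph_layout()
nx.draw_networkx(mdp.G, pos=layout, with_labels=False, node_size=50)
```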
Inherited Members
- colosseum.mdp.simple_grid.base.SimpleGridMDP
- get_action_class
- get_unique_symbols
- does_seed_change_MDP_structure
- sample_mdp_parameters
- get_node_class
- get_gin_parameters
- n_actions
- parameters
- colosseum.mdp.base.BaseMDP
- get_available_hardness_measures
- produce_gin_file_from_mdp_parameters
- get_gin_config
- get_node_labels
- get_node_action_labels
- hash
- instantiate_MDP
- T
- R
- recurrent_nodes_set
- communication_class
- get_optimal_policy
- get_worst_policy
- get_value_functions
- optimal_value_functions
- worst_value_functions
- random_value_functions
- optimal_transition_probabilities
- worst_transition_probabilities
- random_transition_probabilities
- optimal_markov_chain
- worst_markov_chain
- random_markov_chain
- get_stationary_distribution
- optimal_stationary_distribution
- worst_stationary_distribution
- random_stationary_distribution
- optimal_average_rewards
- worst_average_rewards
- random_average_rewards
- get_average_reward
- optimal_average_reward
- worst_average_reward
- random_average_reward
- transition_matrix_and_rewards
- graph_layout
- graph_metrics
- diameter
- sum_reciprocals_suboptimality_gaps
- discounted_value_norm
- undiscounted_value_norm
- value_norm
- measures_of_hardness
- summary
- hardness_report
- get_info_class
- get_transition_distributions
- get_reward_distribution
- sample_reward
- get_measure_from_name
- action_spec
- observation_spec
- get_observation
- reset
- step
- random_steps
- random_step
- get_visitation_counts
- reset_visitation_counts
- get_value_node_labels
- dm_env._environment.Environment
- reward_spec
- discount_spec
- close
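Since the class ultimately implements the `dm_env.Environment` interface listed above, a random-interaction loop can be sketched as follows (an assumption-laden sketch: actions are taken to be integers in `[0, n_actions)`, and in this infinite-horizon variant the episode is not expected to terminate):

```python
import numpy as np

rng = np.random.default_rng(0)
ts = mdp.reset()  # a dm_env TimeStep with observation, reward, and discount
for _ in range(100):
    action = int(rng.integers(mdp.n_actions))
    ts = mdp.step(action)
```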