colosseum.mdp.taxi.infinite_horizon
```python
from typing import Any, Dict, List

import gin

from colosseum.mdp import ContinuousMDP
from colosseum.mdp.taxi.base import TaxiMDP


@gin.configurable
class TaxiContinuous(ContinuousMDP, TaxiMDP):
    @staticmethod
    def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
        return TaxiMDP.sample_mdp_parameters(n, False, seed)
```
```python
@gin.configurable
class TaxiContinuous(ContinuousMDP, TaxiMDP):
```
The base class for continuous MDPs.
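Because the class is decorated with `@gin.configurable`, its constructor arguments can also be supplied through gin bindings instead of keyword arguments. The sketch below is an assumption based on standard gin-config usage, not an excerpt from the colosseum documentation; the binding names simply mirror the constructor parameters documented further down.

```python
import gin

from colosseum.mdp.taxi.infinite_horizon import TaxiContinuous

# Hypothetical bindings; any constructor parameter can be set this way.
gin.parse_config("""
TaxiContinuous.size = 6
TaxiContinuous.make_reward_stochastic = True
""")

mdp = TaxiContinuous(seed=0)  # size and make_reward_stochastic are injected by gin
```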
```python
TaxiContinuous(
    seed: int,
    size: int,
    length=2,
    width=1,
    space=1,
    n_locations=4,
    optimal_mean_reward: float = 0.9,
    sub_optimal_mean_reward: float = 0.2,
    default_r: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    successfully_delivery_r: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    failure_delivery_r: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
)
```
```python
def __init__(
    self,
    seed: int,
    size: int,
    length=2,
    width=1,
    space=1,
    n_locations=2 ** 2,
    optimal_mean_reward: float = 0.9,
    sub_optimal_mean_reward: float = 0.2,
    default_r: Union[Tuple, rv_continuous] = None,
    successfully_delivery_r: Union[Tuple, rv_continuous] = None,
    failure_delivery_r: Union[Tuple, rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
):
    """
    Parameters
    ----------
    seed : int
        The seed used for sampling rewards and next states.
    size : int
        The size of the grid.
    length : int
        The length of the walls.
    width : int
        The width of the walls.
    space : int
        The space between walls.
    n_locations : int
        The number of possible spawn locations. It must be a squared number.
    optimal_mean_reward : float
        If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
        By default, it is set to 0.9.
    sub_optimal_mean_reward: float
        If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
        By default, it is set to 0.1.
    default_r
    successfully_delivery_r : Union[Tuple, rv_continuous]
        The reward distribution for successfully delivering a passenger. It can be either passed as a tuple
        containing Beta parameters or as a rv_continuous object.
    failure_delivery_r
        The reward distribution for failing to deliver a passenger. It can be either passed as a tuple containing
        Beta parameters or as a rv_continuous object.
    make_reward_stochastic : bool
        If True, the rewards of the MDP will be stochastic. By default, it is set to False.
    reward_variance_multiplier : float
        A constant that can be used to increase the variance of the reward distributions without changing their means.
        The lower the value, the higher the variance. By default, it is set to 1.
    """

    if type(successfully_delivery_r) == tuple:
        successfully_delivery_r = get_dist(
            successfully_delivery_r[0], successfully_delivery_r[1]
        )
    if type(failure_delivery_r) == tuple:
        failure_delivery_r = get_dist(failure_delivery_r[0], failure_delivery_r[1])

    if type(default_r) == tuple:
        default_r = get_dist(default_r[0], default_r[1])

    self._size = size
    self._length = length
    self._width = width
    self._space = space
    self.n_locations = n_locations
    self._n_locations = int(np.ceil(n_locations ** 0.5) ** 2)
    self._optimal_mean_reward = optimal_mean_reward
    self._sub_optimal_mean_reward = sub_optimal_mean_reward
    self._locations = []

    dists = [default_r, successfully_delivery_r, failure_delivery_r]
    if dists.count(None) == 0:
        self._default_r = default_r
        self._successfully_delivery_r = successfully_delivery_r
        self._failure_delivery_r = failure_delivery_r
    else:
        if make_reward_stochastic:
            self._default_r = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (1 / sub_optimal_mean_reward - 1),
            )
            self._successfully_delivery_r = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (1 / optimal_mean_reward - 1),
            )
            self._failure_delivery_r = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (10 / sub_optimal_mean_reward - 1),
            )
        else:
            self._default_r = deterministic(0.1)
            self._successfully_delivery_r = deterministic(1)
            self._failure_delivery_r = deterministic(0)

    kwargs[
        "randomize_actions"
    ] = False  # TODO : double check whether this is actually necessary or not

    super(TaxiMDP, self).__init__(
        seed=seed,
        reward_variance_multiplier=reward_variance_multiplier,
        make_reward_stochastic=make_reward_stochastic,
        **kwargs,
    )
```
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The size of the grid.
- length (int): The length of the walls.
- width (int): The width of the walls.
- space (int): The space between walls.
- n_locations (int): The number of possible spawn locations. It must be a perfect square.
- optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 0.9.
- sub_optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.2.
- default_r (Union[Tuple, rv_continuous]): The default reward distribution, used for transitions that are neither a successful nor a failed delivery. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- successfully_delivery_r (Union[Tuple, rv_continuous]): The reward distribution for successfully delivering a passenger. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- failure_delivery_r (Union[Tuple, rv_continuous]): The reward distribution for failing to deliver a passenger. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
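To illustrate how these parameters fit together, here is a minimal usage sketch of constructing the MDP directly (i.e., without gin). The keyword values are arbitrary illustrations, and passing frozen scipy.stats distributions for the reward parameters is an assumption based on the defaults built in the constructor source above; note that, per that code, custom reward distributions are only used when all three of `default_r`, `successfully_delivery_r`, and `failure_delivery_r` are provided.

```python
from scipy import stats

from colosseum.mdp.taxi.infinite_horizon import TaxiContinuous

# Default deterministic rewards.
mdp = TaxiContinuous(seed=0, size=6)

# Stochastic rewards with the built-in Beta parameterization.
stochastic_mdp = TaxiContinuous(
    seed=0,
    size=6,
    make_reward_stochastic=True,
    reward_variance_multiplier=2.0,
)

# Custom reward distributions: only used when all three are given,
# otherwise the constructor falls back to its defaults.
custom_mdp = TaxiContinuous(
    seed=0,
    size=6,
    default_r=stats.beta(1, 9),                # low reward for ordinary steps
    successfully_delivery_r=stats.beta(9, 1),  # high reward for a delivery
    failure_delivery_r=stats.beta(1, 99),      # near-zero reward for a failed delivery
)
```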
```python
@staticmethod
def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
    return TaxiMDP.sample_mdp_parameters(n, False, seed)
```
Returns
- List[Dict[str, Any]]: n sampled parameter dictionaries, each of which can be used to construct an MDP in a reasonable amount of time.
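A usage sketch, under the assumption (suggested by the return description and the constructor above) that each sampled dictionary maps constructor keyword arguments, including a seed, to values:

```python
# Sample three ready-to-use parameter configurations and build an MDP from each.
params_list = TaxiContinuous.sample_parameters(3, seed=42)
mdps = [TaxiContinuous(**params) for params in params_list]
```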
Inherited Members
- colosseum.mdp.taxi.base.TaxiMDP
  - get_unique_symbols
  - does_seed_change_MDP_structure
  - sample_mdp_parameters
  - get_node_class
  - get_gin_parameters
  - n_actions
  - parameters
- colosseum.mdp.base.BaseMDP
  - get_available_hardness_measures
  - produce_gin_file_from_mdp_parameters
  - get_gin_config
  - get_node_labels
  - get_node_action_labels
  - hash
  - instantiate_MDP
  - T
  - R
  - recurrent_nodes_set
  - communication_class
  - get_optimal_policy
  - get_worst_policy
  - get_value_functions
  - optimal_value_functions
  - worst_value_functions
  - random_value_functions
  - optimal_transition_probabilities
  - worst_transition_probabilities
  - random_transition_probabilities
  - optimal_markov_chain
  - worst_markov_chain
  - random_markov_chain
  - get_stationary_distribution
  - optimal_stationary_distribution
  - worst_stationary_distribution
  - random_stationary_distribution
  - optimal_average_rewards
  - worst_average_rewards
  - random_average_rewards
  - get_average_reward
  - optimal_average_reward
  - worst_average_reward
  - random_average_reward
  - transition_matrix_and_rewards
  - graph_layout
  - graph_metrics
  - diameter
  - sum_reciprocals_suboptimality_gaps
  - discounted_value_norm
  - undiscounted_value_norm
  - value_norm
  - measures_of_hardness
  - summary
  - hardness_report
  - get_info_class
  - get_transition_distributions
  - get_reward_distribution
  - sample_reward
  - get_measure_from_name
  - action_spec
  - observation_spec
  - get_observation
  - reset
  - step
  - random_steps
  - random_step
  - get_visitation_counts
  - reset_visitation_counts
  - get_value_node_labels
- dm_env._environment.Environment
  - reward_spec
  - discount_spec
  - close
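Since BaseMDP exposes the dm_env.Environment interface listed above (reset, step, action_spec, observation_spec), interacting with an instance follows the usual dm_env pattern. A minimal sketch, assuming integer actions indexed by the inherited n_actions property; the grid size and the printed fields are illustrative only:

```python
import numpy as np

from colosseum.mdp.taxi.infinite_horizon import TaxiContinuous

mdp = TaxiContinuous(seed=0, size=6)
rng = np.random.default_rng(0)

ts = mdp.reset()
for _ in range(10):
    # Infinite-horizon MDP: there is no natural episode end, so we simply
    # take a fixed number of uniformly random actions.
    ts = mdp.step(int(rng.integers(mdp.n_actions)))
    print(ts.reward, ts.observation)
```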