colosseum.mdp.deep_sea.infinite_horizon
```python
from typing import Any, Dict, List, Tuple

import gin

from colosseum.mdp import ContinuousMDP
from colosseum.mdp.deep_sea.base import DeepSeaMDP, DeepSeaNode


@gin.configurable
class DeepSeaContinuous(ContinuousMDP, DeepSeaMDP):
    """
    The continuous DeepSea MDP class.
    """

    @staticmethod
    def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
        return DeepSeaMDP.sample_mdp_parameters(n, False, seed)

    def custom_graph_layout(self) -> Dict[DeepSeaNode, Tuple[int, int]]:
        """
        Returns
        -------
        Dict[DeepSeaNode, Tuple[int, int]]
            The custom layout to draw a nx.Graph.
        """
        return {node: tuple(node) for node in self.G}
```
@gin.configurable
class DeepSeaContinuous(ContinuousMDP, DeepSeaMDP):
The continuous DeepSea MDP class.
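Since the class is decorated with `@gin.configurable`, its constructor arguments can be bound through gin. A minimal sketch, assuming gin-config is installed; the inline bindings and the chosen values are illustrative, and in practice they would typically live in a `.gin` file parsed with `gin.parse_config_file`:

```python
import gin

from colosseum.mdp.deep_sea.infinite_horizon import DeepSeaContinuous

# Illustrative gin bindings for the constructor arguments documented below.
gin.parse_config("""
DeepSeaContinuous.size = 10
DeepSeaContinuous.make_reward_stochastic = True
""")

mdp = DeepSeaContinuous(seed=0)  # size and stochasticity come from the gin bindings
```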
```python
DeepSeaContinuous(
    seed: int,
    size: int,
    optimal_return: float = 1.0,
    suboptimal_return: float = 0.5,
    optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    sub_optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    other_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
)
```
```python
def __init__(
    self,
    seed: int,
    size: int,
    optimal_return: float = 1.0,
    suboptimal_return: float = 0.5,
    optimal_distribution: Union[Tuple, rv_continuous] = None,
    sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
    other_distribution: Union[Tuple, rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
):
    """
    Parameters
    ----------
    seed : int
        The seed used for sampling rewards and next states.
    size : int
        The size of the grid.
    optimal_return : float
        If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
        By default, it is set to 1.
    suboptimal_return: float
        If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
        By default, it is set to 0.5.
    optimal_distribution : Union[Tuple, rv_continuous]
        The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
        or as a rv_continuous object.
    sub_optimal_distribution : Union[Tuple, rv_continuous]
        The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
        parameters or as a rv_continuous object.
    other_distribution : Union[Tuple, rv_continuous]
        The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
        rv_continuous object.
    make_reward_stochastic : bool
        If True, the rewards of the MDP will be stochastic. By default, it is set to False.
    reward_variance_multiplier : float
        A constant that can be used to increase the variance of the reward distributions without changing their means.
        The lower the value, the higher the variance. By default, it is set to 1.
    """

    if type(sub_optimal_distribution) == tuple:
        sub_optimal_distribution = get_dist(
            sub_optimal_distribution[0], sub_optimal_distribution[1]
        )
    if type(optimal_distribution) == tuple:
        optimal_distribution = get_dist(
            optimal_distribution[0], optimal_distribution[1]
        )
    if type(other_distribution) == tuple:
        other_distribution = get_dist(other_distribution[0], other_distribution[1])

    self._size = size
    self._optimal_return = optimal_return
    self._suboptimal_return = suboptimal_return
    self._optimal_distribution = optimal_distribution
    self._sub_optimal_distribution = sub_optimal_distribution
    self._other_distribution = other_distribution

    dists = [
        sub_optimal_distribution,
        optimal_distribution,
        other_distribution,
    ]
    if dists.count(None) == 0:
        self._sub_optimal_distribution = sub_optimal_distribution
        self._optimal_distribution = optimal_distribution
        self._other_distribution = other_distribution
    else:
        if make_reward_stochastic:
            self._sub_optimal_distribution = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (size / self._suboptimal_return - 1),
            )
            self._optimal_distribution = beta(
                reward_variance_multiplier * (size / self._optimal_return - 1),
                reward_variance_multiplier,
            )
            self._other_distribution = beta(
                reward_variance_multiplier,
                reward_variance_multiplier
                * 10
                * (size / self._suboptimal_return - 1),
            )
        else:
            self._sub_optimal_distribution = deterministic(1.0 / (size ** 2))
            self._optimal_distribution = deterministic(1.0)
            self._other_distribution = deterministic(0.0)

    super(DeepSeaMDP, self).__init__(
        seed=seed,
        reward_variance_multiplier=reward_variance_multiplier,
        make_reward_stochastic=make_reward_stochastic,
        **kwargs,
    )
```
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The size of the grid.
- optimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 1.
- suboptimal_return (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.5.
- optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that controls the variance of the reward distributions without changing their means; the lower the value, the higher the variance. By default, it is set to 1.
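A minimal usage sketch, assuming colosseum is installed; the seed and size values are illustrative. With the defaults, rewards are deterministic (1.0 for the optimal state, `1 / size**2` for suboptimal ones, 0.0 elsewhere, as in the source above):

```python
from colosseum.mdp.deep_sea.infinite_horizon import DeepSeaContinuous

# Deterministic rewards (the default).
mdp = DeepSeaContinuous(seed=42, size=10)

# Stochastic rewards: Beta distributions are derived from the mean returns
# and the variance multiplier, as in the constructor above.
stochastic_mdp = DeepSeaContinuous(
    seed=42,
    size=10,
    make_reward_stochastic=True,
    optimal_return=0.9,
    suboptimal_return=0.2,
    reward_variance_multiplier=2.0,
)

# The MDP exposes the dm_env interface inherited from BaseMDP
# (see the inherited members below).
ts = mdp.reset()
ts = mdp.step(0)
```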
@staticmethod
def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
```python
@staticmethod
def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
    return DeepSeaMDP.sample_mdp_parameters(n, False, seed)
```
Returns
- List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
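A short sketch of how the sampled parameters might be consumed; unpacking each dictionary into the constructor is an assumption based on the `Dict[str, Any]` return type:

```python
# Sample three parameter dictionaries and build one MDP from each.
params = DeepSeaContinuous.sample_parameters(n=3, seed=0)
mdps = [DeepSeaContinuous(**p) for p in params]
```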
def custom_graph_layout(self) -> Dict[DeepSeaNode, Tuple[int, int]]:

```python
def custom_graph_layout(self) -> Dict[DeepSeaNode, Tuple[int, int]]:
    """
    Returns
    -------
    Dict[DeepSeaNode, Tuple[int, int]]
        The custom layout to draw a nx.Graph.
    """
    return {node: tuple(node) for node in self.G}
```
Returns
- Dict[DeepSeaNode, Tuple[int, int]]: The custom node layout used when drawing the MDP as a nx.Graph.
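For illustration, the layout can be passed to networkx's drawing functions. A sketch, assuming networkx and matplotlib are installed; it uses the `G` graph attribute visible in the source above:

```python
import matplotlib.pyplot as plt
import networkx as nx

mdp = DeepSeaContinuous(seed=42, size=5)
layout = mdp.custom_graph_layout()  # maps each DeepSeaNode to its coordinate tuple
nx.draw(mdp.G, pos=layout, node_size=50)
plt.show()
```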
Inherited Members
- colosseum.mdp.deep_sea.base.DeepSeaMDP
  - get_unique_symbols
  - does_seed_change_MDP_structure
  - sample_mdp_parameters
  - get_node_class
  - get_gin_parameters
  - n_actions
  - parameters
- colosseum.mdp.base.BaseMDP
  - get_available_hardness_measures
  - produce_gin_file_from_mdp_parameters
  - get_gin_config
  - get_node_labels
  - get_node_action_labels
  - hash
  - instantiate_MDP
  - T
  - R
  - recurrent_nodes_set
  - communication_class
  - get_optimal_policy
  - get_worst_policy
  - get_value_functions
  - optimal_value_functions
  - worst_value_functions
  - random_value_functions
  - optimal_transition_probabilities
  - worst_transition_probabilities
  - random_transition_probabilities
  - optimal_markov_chain
  - worst_markov_chain
  - random_markov_chain
  - get_stationary_distribution
  - optimal_stationary_distribution
  - worst_stationary_distribution
  - random_stationary_distribution
  - optimal_average_rewards
  - worst_average_rewards
  - random_average_rewards
  - get_average_reward
  - optimal_average_reward
  - worst_average_reward
  - random_average_reward
  - transition_matrix_and_rewards
  - graph_layout
  - graph_metrics
  - diameter
  - sum_reciprocals_suboptimality_gaps
  - discounted_value_norm
  - undiscounted_value_norm
  - value_norm
  - measures_of_hardness
  - summary
  - hardness_report
  - get_info_class
  - get_transition_distributions
  - get_reward_distribution
  - sample_reward
  - get_measure_from_name
  - action_spec
  - observation_spec
  - get_observation
  - reset
  - step
  - random_steps
  - random_step
  - get_visitation_counts
  - reset_visitation_counts
  - get_value_node_labels
- dm_env._environment.Environment
  - reward_spec
  - discount_spec
  - close