colosseum.mdp.river_swim.infinite_horizon
```python
from typing import Any, Dict, List, Tuple

import gin

from colosseum.mdp import ContinuousMDP
from colosseum.mdp.river_swim.base import RiverSwimMDP, RiverSwimNode


@gin.configurable
class RiverSwimContinuous(ContinuousMDP, RiverSwimMDP):
    """
    The continuous RiverSwim MDP.
    """

    @staticmethod
    def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
        return RiverSwimMDP.sample_mdp_parameters(n, False, seed)

    def custom_graph_layout(self) -> Dict[RiverSwimNode, Tuple[int, int]]:
        """
        Returns
        -------
        Dict[RiverSwimNode, Tuple[int, int]]
            The custom layout to draw a nx.Graph.
        """
        return {node: tuple(node) for node in self.G}
```
```python
@gin.configurable
class RiverSwimContinuous(ContinuousMDP, RiverSwimMDP):
```
The continuous RiverSwim MDP.
```python
RiverSwimContinuous(
    seed: int,
    size: int,
    optimal_mean_reward: float = 0.9,
    sub_optimal_mean_reward: float = 0.2,
    sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
    optimal_distribution: Union[Tuple, rv_continuous] = None,
    other_distribution: Union[Tuple, rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
)
```
```python
def __init__(
    self,
    seed: int,
    size: int,
    optimal_mean_reward: float = 0.9,
    sub_optimal_mean_reward: float = 0.2,
    sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
    optimal_distribution: Union[Tuple, rv_continuous] = None,
    other_distribution: Union[Tuple, rv_continuous] = None,
    make_reward_stochastic=False,
    reward_variance_multiplier: float = 1.0,
    **kwargs,
):
    """
    Parameters
    ----------
    seed : int
        The seed used for sampling rewards and next states.
    size : int
        The length of the chain.
    optimal_mean_reward : float
        If the rewards are made stochastic, this parameter controls the mean reward for the highly rewarding states.
        By default, it is set to 0.9.
    sub_optimal_mean_reward : float
        If the rewards are made stochastic, this parameter controls the mean reward for the suboptimal states.
        By default, it is set to 0.2.
    sub_optimal_distribution : Union[Tuple, rv_continuous]
        The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
        parameters or as a rv_continuous object.
    optimal_distribution : Union[Tuple, rv_continuous]
        The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
        or as a rv_continuous object.
    other_distribution : Union[Tuple, rv_continuous]
        The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
        rv_continuous object.
    make_reward_stochastic : bool
        If True, the rewards of the MDP will be stochastic. By default, it is set to False.
    reward_variance_multiplier : float
        A constant that can be used to increase the variance of the reward distributions without changing their means.
        The lower the value, the higher the variance. By default, it is set to 1.
    """

    if type(sub_optimal_distribution) == tuple:
        sub_optimal_distribution = get_dist(
            sub_optimal_distribution[0], sub_optimal_distribution[1]
        )
    if type(optimal_distribution) == tuple:
        optimal_distribution = get_dist(
            optimal_distribution[0], optimal_distribution[1]
        )
    if type(other_distribution) == tuple:
        other_distribution = get_dist(other_distribution[0], other_distribution[1])

    self._size = size
    self._optimal_mean_reward = optimal_mean_reward
    self._sub_optimal_mean_reward = sub_optimal_mean_reward
    self._optimal_distribution = optimal_distribution
    self._sub_optimal_distribution = sub_optimal_distribution
    self._other_distribution = other_distribution

    dists = [
        sub_optimal_distribution,
        optimal_distribution,
        other_distribution,
    ]
    if dists.count(None) == 0:
        self._sub_optimal_distribution = sub_optimal_distribution
        self._optimal_distribution = optimal_distribution
        self._other_distribution = other_distribution
    else:
        if make_reward_stochastic:
            if self.is_episodic():
                sub_optimal_mean_reward /= self._size
            self._sub_optimal_distribution = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (1 / sub_optimal_mean_reward - 1),
            )
            self._optimal_distribution = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (1 / optimal_mean_reward - 1),
            )
            self._other_distribution = beta(
                reward_variance_multiplier,
                reward_variance_multiplier * (10 / sub_optimal_mean_reward - 1),
            )
        else:
            self._sub_optimal_distribution = deterministic(5 / 1000)
            self._optimal_distribution = deterministic(1.0)
            self._other_distribution = deterministic(0.0)

    super(RiverSwimMDP, self).__init__(
        seed=seed,
        reward_variance_multiplier=reward_variance_multiplier,
        make_reward_stochastic=make_reward_stochastic,
        **kwargs,
    )
```
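As the source above shows, when no distributions are provided and make_reward_stochastic=True, the reward distributions are Beta distributions with parameters α = reward_variance_multiplier and β = reward_variance_multiplier * (1 / mean_reward - 1), so the mean α / (α + β) equals the requested mean reward while reward_variance_multiplier only affects the variance. A minimal sketch of this identity (the variable names below are illustrative, not part of the API):

```python
from scipy.stats import beta

mean_reward = 0.9  # e.g. optimal_mean_reward
c = 1.0            # reward_variance_multiplier

# beta(c, c * (1 / mean_reward - 1)) has mean c / (c + c * (1 / mean_reward - 1)) = mean_reward.
dist = beta(c, c * (1 / mean_reward - 1))
assert abs(dist.mean() - mean_reward) < 1e-12
```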
Parameters
- seed (int): The seed used for sampling rewards and next states.
- size (int): The length of the chain.
- optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the highly rewarding states. By default, it is set to 0.9.
- sub_optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the suboptimal states. By default, it is set to 0.2.
- sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be passed either as a tuple containing Beta parameters or as an rv_continuous object.
- make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
- reward_variance_multiplier (float): A constant that controls the variance of the reward distributions without changing their means: the lower the value, the higher the variance. By default, it is set to 1.
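A minimal construction sketch, assuming colosseum is installed and the class is importable from the module documented above; the parameter values are illustrative:

```python
from colosseum.mdp.river_swim.infinite_horizon import RiverSwimContinuous

# Deterministic-reward chain of length 5.
mdp = RiverSwimContinuous(seed=42, size=5)

# Stochastic rewards: Beta distributions are derived from the mean-reward parameters.
stochastic_mdp = RiverSwimContinuous(
    seed=42,
    size=5,
    make_reward_stochastic=True,
    optimal_mean_reward=0.9,
    sub_optimal_mean_reward=0.2,
)
```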
```python
@staticmethod
def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
    return RiverSwimMDP.sample_mdp_parameters(n, False, seed)
```
Returns
- List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
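Since each sampled dictionary is meant to be a valid set of constructor arguments, it can be unpacked directly into the class. A short sketch under that assumption:

```python
# Sample three parameter configurations and build an MDP from each.
params_list = RiverSwimContinuous.sample_parameters(n=3, seed=0)
mdps = [RiverSwimContinuous(**params) for params in params_list]
```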
```python
def custom_graph_layout(self) -> Dict[RiverSwimNode, Tuple[int, int]]:
    """
    Returns
    -------
    Dict[RiverSwimNode, Tuple[int, int]]
        The custom layout to draw a nx.Graph.
    """
    return {node: tuple(node) for node in self.G}
```
Returns
- Dict[RiverSwimNode, Tuple[int, int]]: The custom layout used to draw the MDP as an nx.Graph.
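Because the layout maps each node to a pair of integer coordinates, it can be used directly as a networkx position dictionary. A hedged sketch, assuming mdp is an instantiated RiverSwimContinuous and that its transition graph is exposed as mdp.G (as in the source above):

```python
import matplotlib.pyplot as plt
import networkx as nx

pos = mdp.custom_graph_layout()  # {RiverSwimNode: (x, y)}
nx.draw(mdp.G, pos=pos, with_labels=True, node_size=600)
plt.show()
```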
Inherited Members
- colosseum.mdp.river_swim.base.RiverSwimMDP
- get_action_class
- get_unique_symbols
- does_seed_change_MDP_structure
- sample_mdp_parameters
- get_node_class
- get_gin_parameters
- n_actions
- parameters
- colosseum.mdp.base.BaseMDP
- get_available_hardness_measures
- produce_gin_file_from_mdp_parameters
- get_gin_config
- get_node_labels
- get_node_action_labels
- hash
- instantiate_MDP
- T
- R
- recurrent_nodes_set
- communication_class
- get_optimal_policy
- get_worst_policy
- get_value_functions
- optimal_value_functions
- worst_value_functions
- random_value_functions
- optimal_transition_probabilities
- worst_transition_probabilities
- random_transition_probabilities
- optimal_markov_chain
- worst_markov_chain
- random_markov_chain
- get_stationary_distribution
- optimal_stationary_distribution
- worst_stationary_distribution
- random_stationary_distribution
- optimal_average_rewards
- worst_average_rewards
- random_average_rewards
- get_average_reward
- optimal_average_reward
- worst_average_reward
- random_average_reward
- transition_matrix_and_rewards
- graph_layout
- graph_metrics
- diameter
- sum_reciprocals_suboptimality_gaps
- discounted_value_norm
- undiscounted_value_norm
- value_norm
- measures_of_hardness
- summary
- hardness_report
- get_info_class
- get_transition_distributions
- get_reward_distribution
- sample_reward
- get_measure_from_name
- action_spec
- observation_spec
- get_observation
- reset
- step
- random_steps
- random_step
- get_visitation_counts
- reset_visitation_counts
- get_value_node_labels
- dm_env._environment.Environment
- reward_spec
- discount_spec
- close
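Since the MDP inherits the dm_env.Environment interface (reset, step, action_spec, and observation_spec above), it can be driven with the usual reset/step loop. A hedged sketch, assuming the action spec is a discrete dm_env spec exposing num_values (the standard dm_env convention) and mdp is an instantiated RiverSwimContinuous:

```python
import numpy as np

rng = np.random.default_rng(0)
time_step = mdp.reset()
for _ in range(10):
    # Sample a random action from the discrete action spec.
    action = rng.integers(mdp.action_spec().num_values)
    time_step = mdp.step(action)
    print(time_step.reward, time_step.observation)
```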