colosseum.mdp.river_swim.infinite_horizon

from typing import Any, Dict, List, Tuple

import gin

from colosseum.mdp import ContinuousMDP
from colosseum.mdp.river_swim.base import RiverSwimMDP, RiverSwimNode


@gin.configurable
class RiverSwimContinuous(ContinuousMDP, RiverSwimMDP):
    """
    The continuous RiverSwim MDP.
    """

    @staticmethod
    def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
        return RiverSwimMDP.sample_mdp_parameters(n, False, seed)

    def custom_graph_layout(self) -> Dict[RiverSwimNode, Tuple[int, int]]:
        """
        Returns
        -------
        Dict[RiverSwimNode, Tuple[int, int]]
            The custom layout to draw a nx.Graph.
        """
        return {node: tuple(node) for node in self.G}
@gin.configurable
class RiverSwimContinuous(colosseum.mdp.base_infinite.ContinuousMDP, colosseum.mdp.river_swim.base.RiverSwimMDP):

The continuous RiverSwim MDP.

RiverSwimContinuous(seed: int, size: int, optimal_mean_reward: float = 0.9, sub_optimal_mean_reward: float = 0.2, sub_optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, other_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, make_reward_stochastic=False, reward_variance_multiplier: float = 1.0, **kwargs)
    def __init__(
        self,
        seed: int,
        size: int,
        optimal_mean_reward: float = 0.9,
        sub_optimal_mean_reward: float = 0.2,
        sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
        optimal_distribution: Union[Tuple, rv_continuous] = None,
        other_distribution: Union[Tuple, rv_continuous] = None,
        make_reward_stochastic=False,
        reward_variance_multiplier: float = 1.0,
        **kwargs,
    ):
        """
        Parameters
        ----------
        seed : int
            The seed used for sampling rewards and next states.
        size : int
            The length of the chain.
        optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward for the highly rewarding states.
            By default, it is set to 0.9.
        sub_optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward for the suboptimal states.
            By default, it is set to 0.2.
        sub_optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
            parameters or as a rv_continuous object.
        optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
            or as a rv_continuous object.
        other_distribution : Union[Tuple, rv_continuous]
            The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
            rv_continuous object.
        make_reward_stochastic : bool
            If True, the rewards of the MDP will be stochastic. By default, it is set to False.
        reward_variance_multiplier : float
            A constant that can be used to increase the variance of the reward distributions without changing their means.
            The lower the value, the higher the variance. By default, it is set to 1.
        """

        if type(sub_optimal_distribution) == tuple:
            sub_optimal_distribution = get_dist(
                sub_optimal_distribution[0], sub_optimal_distribution[1]
            )
        if type(optimal_distribution) == tuple:
            optimal_distribution = get_dist(
                optimal_distribution[0], optimal_distribution[1]
            )
        if type(other_distribution) == tuple:
            other_distribution = get_dist(other_distribution[0], other_distribution[1])

        self._size = size
        self._optimal_mean_reward = optimal_mean_reward
        self._sub_optimal_mean_reward = sub_optimal_mean_reward
        self._optimal_distribution = optimal_distribution
        self._sub_optimal_distribution = sub_optimal_distribution
        self._other_distribution = other_distribution

        dists = [
            sub_optimal_distribution,
            optimal_distribution,
            other_distribution,
        ]
        if dists.count(None) == 0:
            self._sub_optimal_distribution = sub_optimal_distribution
            self._optimal_distribution = optimal_distribution
            self._other_distribution = other_distribution
        else:
            if make_reward_stochastic:
                if self.is_episodic():
                    sub_optimal_mean_reward /= self._size
                self._sub_optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / sub_optimal_mean_reward - 1),
                )
                self._optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / optimal_mean_reward - 1),
                )
                self._other_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (10 / sub_optimal_mean_reward - 1),
                )
            else:
                self._sub_optimal_distribution = deterministic(5 / 1000)
                self._optimal_distribution = deterministic(1.0)
                self._other_distribution = deterministic(0.0)

        super(RiverSwimMDP, self).__init__(
            seed=seed,
            reward_variance_multiplier=reward_variance_multiplier,
            make_reward_stochastic=make_reward_stochastic,
            **kwargs,
        )
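When make_reward_stochastic is True and no distributions are supplied, the constructor above builds Beta distributions with parameters a = c and b = c * (1 / mu - 1), where c is reward_variance_multiplier and mu the target mean reward; the mean a / (a + b) then equals mu for any c, while larger c concentrates the distribution. A minimal numerical check of this parameterization using scipy.stats.beta (the values below are illustrative only):

from scipy.stats import beta

mu = 0.9  # target mean reward
for c in (0.5, 1.0, 4.0):  # a few reward_variance_multiplier values
    dist = beta(c, c * (1 / mu - 1))  # same parameterization as in the constructor
    # The mean stays at mu while the variance shrinks as c grows.
    print(f"c={c}: mean={dist.mean():.3f}, var={dist.var():.4f}")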
Parameters
  • seed (int): The seed used for sampling rewards and next states.
  • size (int): The length of the chain.
  • optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the highly rewarding states. By default, it is set to 0.9.
  • sub_optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the suboptimal states. By default, it is set to 0.2.
  • sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
  • reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
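A minimal construction sketch, assuming the default keyword arguments of the base MDP suffice; the seed, size, and mean-reward values below are arbitrary:

from colosseum.mdp.river_swim.infinite_horizon import RiverSwimContinuous

# Deterministic rewards on a chain of length 5.
mdp = RiverSwimContinuous(seed=42, size=5)

# Stochastic rewards with custom means for the optimal and suboptimal states.
stochastic_mdp = RiverSwimContinuous(
    seed=42,
    size=5,
    make_reward_stochastic=True,
    optimal_mean_reward=0.95,
    sub_optimal_mean_reward=0.1,
)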
@staticmethod
def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
Returns
  • List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
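One way the sampled configurations might be used, assuming each returned dictionary contains only keyword arguments accepted by the constructor:

from colosseum.mdp.river_swim.infinite_horizon import RiverSwimContinuous

# Sample three parameter sets and build an MDP from each one.
for params in RiverSwimContinuous.sample_parameters(n=3, seed=0):
    mdp = RiverSwimContinuous(**params)
    print(mdp.parameters)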
def custom_graph_layout(self) -> Dict[colosseum.mdp.river_swim.base.RiverSwimNode, Tuple[int, int]]:
Returns
  • Dict[RiverSwimNode, Tuple[int, int]]: The custom layout to draw a nx.Graph.
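A possible use of this layout when drawing the transition graph, assuming the graph is reachable as the G attribute and that networkx is installed; nx.draw_networkx is a standard networkx routine, not a Colosseum helper:

import networkx as nx

from colosseum.mdp.river_swim.infinite_horizon import RiverSwimContinuous

mdp = RiverSwimContinuous(seed=42, size=5)
layout = mdp.custom_graph_layout()  # maps each node to its (x, y) position along the chain
nx.draw_networkx(mdp.G, pos=layout, with_labels=True)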
Inherited Members
colosseum.mdp.base_infinite.ContinuousMDP:
    is_episodic, get_grid_representation
colosseum.mdp.river_swim.base.RiverSwimMDP:
    get_action_class, get_unique_symbols, does_seed_change_MDP_structure, sample_mdp_parameters, get_node_class, get_gin_parameters, n_actions, parameters
colosseum.mdp.base.BaseMDP:
    get_available_hardness_measures, produce_gin_file_from_mdp_parameters, get_gin_config, get_node_labels, get_node_action_labels, hash, instantiate_MDP, T, R, recurrent_nodes_set, communication_class, get_optimal_policy, get_worst_policy, get_value_functions, optimal_value_functions, worst_value_functions, random_value_functions, optimal_transition_probabilities, worst_transition_probabilities, random_transition_probabilities, optimal_markov_chain, worst_markov_chain, random_markov_chain, get_stationary_distribution, optimal_stationary_distribution, worst_stationary_distribution, random_stationary_distribution, optimal_average_rewards, worst_average_rewards, random_average_rewards, get_average_reward, optimal_average_reward, worst_average_reward, random_average_reward, transition_matrix_and_rewards, graph_layout, graph_metrics, diameter, sum_reciprocals_suboptimality_gaps, discounted_value_norm, undiscounted_value_norm, value_norm, measures_of_hardness, summary, hardness_report, get_info_class, get_transition_distributions, get_reward_distribution, sample_reward, get_measure_from_name, action_spec, observation_spec, get_observation, reset, step, random_steps, random_step, get_visitation_counts, reset_visitation_counts, get_value_node_labels
dm_env._environment.Environment:
    reward_spec, discount_spec, close
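Since the inherited interface follows dm_env's reset/step cycle, a basic interaction loop might look as follows; the random policy is purely illustrative, and the snippet assumes action_spec() returns a dm_env DiscreteArray (the n_actions property listed above could be used instead):

import numpy as np

from colosseum.mdp.river_swim.infinite_horizon import RiverSwimContinuous

mdp = RiverSwimContinuous(seed=42, size=5)
rng = np.random.default_rng(0)
n_actions = mdp.action_spec().num_values  # assumption: discrete dm_env action spec

timestep = mdp.reset()
total_reward = 0.0
for _ in range(1_000):
    timestep = mdp.step(int(rng.integers(n_actions)))
    total_reward += timestep.reward
print(total_reward / 1_000)  # empirical average reward of the random policy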