colosseum.mdp.simple_grid.infinite_horizon

from typing import Any, Dict, List, Tuple, TYPE_CHECKING

import gin

from colosseum.mdp import ContinuousMDP
from colosseum.mdp.simple_grid.base import SimpleGridMDP

if TYPE_CHECKING:
    from colosseum.mdp import NODE_TYPE


@gin.configurable
class SimpleGridContinuous(ContinuousMDP, SimpleGridMDP):
    @staticmethod
    def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
        return SimpleGridMDP.sample_mdp_parameters(n, False, seed)

    def custom_graph_layout(self) -> Dict["NODE_TYPE", Tuple[float, float]]:
        return {node: list(node) for node in self.G}
@gin.configurable
class SimpleGridContinuous(colosseum.mdp.base_infinite.ContinuousMDP, colosseum.mdp.simple_grid.base.SimpleGridMDP):

The base class for continuous MDPs.
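
Because the class is decorated with @gin.configurable, its constructor arguments can also be supplied through gin bindings rather than passed explicitly. A minimal sketch, assuming the standard gin-config API and that any remaining required arguments of the base classes have defaults:

import gin

from colosseum.mdp.simple_grid.infinite_horizon import SimpleGridContinuous

# Bind constructor parameters through gin; the binding names mirror the
# parameters of __init__.
gin.parse_config(
    [
        "SimpleGridContinuous.seed = 42",
        "SimpleGridContinuous.size = 5",
        "SimpleGridContinuous.make_reward_stochastic = True",
    ]
)

mdp = SimpleGridContinuous()  # arguments are injected from the gin bindings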

SimpleGridContinuous( seed: int, size: int, reward_type: colosseum.mdp.simple_grid.base.SimpleGridReward = <SimpleGridReward.XOR: 3>, n_starting_states: int = 1, optimal_mean_reward: float = 0.9, sub_optimal_mean_reward: float = 0.2, optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, sub_optimal_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, other_distribution: Union[Tuple, scipy.stats._distn_infrastructure.rv_continuous] = None, make_reward_stochastic=False, reward_variance_multiplier: float = 1.0, **kwargs)
    def __init__(
        self,
        seed: int,
        size: int,
        reward_type: SimpleGridReward = SimpleGridReward.XOR,
        n_starting_states: int = 1,
        optimal_mean_reward: float = 0.9,
        sub_optimal_mean_reward: float = 0.2,
        optimal_distribution: Union[Tuple, rv_continuous] = None,
        sub_optimal_distribution: Union[Tuple, rv_continuous] = None,
        other_distribution: Union[Tuple, rv_continuous] = None,
        make_reward_stochastic=False,
        reward_variance_multiplier: float = 1.0,
        **kwargs,
    ):
        """

        Parameters
        ----------
        seed : int
            The seed used for sampling rewards and next states.
        size : int
            The size of the grid.
        reward_type : SimpleGridReward
            The type of reward for the MDP. By default, the XOR type is used.
        n_starting_states : int
            The number of possible starting states.
        optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory.
            By default, it is set to 0.9.
        sub_optimal_mean_reward : float
            If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories.
            By default, it is set to 0.2.
        optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters
            or as a rv_continuous object.
        sub_optimal_distribution : Union[Tuple, rv_continuous]
            The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta
            parameters or as a rv_continuous object.
        other_distribution : Union[Tuple, rv_continuous]
            The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a
            rv_continuous object.
        make_reward_stochastic : bool
            If True, the rewards of the MDP will be stochastic. By default, it is set to False.
        reward_variance_multiplier : float
            A constant that can be used to increase the variance of the reward distributions without changing their means.
            The lower the value, the higher the variance. By default, it is set to 1.
        """

        if type(sub_optimal_distribution) == tuple:
            sub_optimal_distribution = get_dist(
                sub_optimal_distribution[0], sub_optimal_distribution[1]
            )
        if type(optimal_distribution) == tuple:
            optimal_distribution = get_dist(
                optimal_distribution[0], optimal_distribution[1]
            )
        if type(other_distribution) == tuple:
            other_distribution = get_dist(other_distribution[0], other_distribution[1])

        self._size = size
        self._reward_type = SimpleGridReward(reward_type)
        self._n_starting_states = n_starting_states
        self._optimal_mean_reward = optimal_mean_reward
        self._sub_optimal_mean_reward = sub_optimal_mean_reward
        dists = [
            sub_optimal_distribution,
            optimal_distribution,
            other_distribution,
        ]

        if dists.count(None) == 0:
            self._sub_optimal_distribution = sub_optimal_distribution
            self._optimal_distribution = optimal_distribution
            self._other_distribution = other_distribution
        else:
            if make_reward_stochastic:
                self._sub_optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (10 / sub_optimal_mean_reward - 1),
                )
                self._optimal_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / optimal_mean_reward - 1),
                )
                self._other_distribution = beta(
                    reward_variance_multiplier,
                    reward_variance_multiplier * (1 / sub_optimal_mean_reward - 1),
                )
            else:
                self._sub_optimal_distribution = deterministic(0.0)
                self._optimal_distribution = deterministic(1.0)
                self._other_distribution = deterministic(0.5)

        super(SimpleGridMDP, self).__init__(
            seed=seed,
            reward_variance_multiplier=reward_variance_multiplier,
            make_reward_stochastic=make_reward_stochastic,
            **kwargs,
        )
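
Unless all three distribution arguments are provided, and make_reward_stochastic is True, the constructor falls back to Beta distributions whose first shape parameter is reward_variance_multiplier and whose mean is tied to the mean-reward arguments: since Beta(c·a, c·b) has mean a / (a + b) regardless of c, the optimal distribution has mean optimal_mean_reward, while the suboptimal one uses the factor 10 / sub_optimal_mean_reward - 1 and therefore has mean sub_optimal_mean_reward / 10. A small numerical check of this parameterization, using only scipy:

from scipy.stats import beta

c = 1.0  # reward_variance_multiplier
optimal_mean_reward = 0.9
sub_optimal_mean_reward = 0.2

# Beta(a, b) has mean a / (a + b), so the multiplier c cancels out of the mean.
optimal = beta(c, c * (1 / optimal_mean_reward - 1))
sub_optimal = beta(c, c * (10 / sub_optimal_mean_reward - 1))

print(optimal.mean())      # 0.9
print(sub_optimal.mean())  # 0.02, i.e. sub_optimal_mean_reward / 10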
Parameters
  • seed (int): The seed used for sampling rewards and next states.
  • size (int): The size of the grid.
  • reward_type (SimpleGridReward): The type of reward for the MDP. By default, the XOR type is used.
  • n_starting_states (int): The number of possible starting states.
  • optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for the optimal trajectory. By default, it is set to 0.9.
  • sub_optimal_mean_reward (float): If the rewards are made stochastic, this parameter controls the mean reward for suboptimal trajectories. By default, it is set to 0.2.
  • optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the highly rewarding state. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • sub_optimal_distribution (Union[Tuple, rv_continuous]): The distribution of the suboptimal rewarding states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • other_distribution (Union[Tuple, rv_continuous]): The distribution of the other states. It can be either passed as a tuple containing Beta parameters or as a rv_continuous object.
  • make_reward_stochastic (bool): If True, the rewards of the MDP will be stochastic. By default, it is set to False.
  • reward_variance_multiplier (float): A constant that can be used to increase the variance of the reward distributions without changing their means. The lower the value, the higher the variance. By default, it is set to 1.
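
A minimal usage sketch with illustrative values. It relies only on the constructor above and on the inherited reset, step, and n_actions members, and it assumes actions are indexed by integers in [0, n_actions):

import numpy as np

from colosseum.mdp.simple_grid.infinite_horizon import SimpleGridContinuous

# Deterministic rewards (the default behaviour).
mdp = SimpleGridContinuous(seed=42, size=5)

# Stochastic rewards built from the default Beta parameterization.
stochastic_mdp = SimpleGridContinuous(
    seed=42,
    size=5,
    make_reward_stochastic=True,
    optimal_mean_reward=0.9,
    sub_optimal_mean_reward=0.2,
)

# Interact through the dm_env-style interface inherited from BaseMDP.
rng = np.random.default_rng(0)
stochastic_mdp.reset()
for _ in range(10):
    action = int(rng.integers(stochastic_mdp.n_actions))  # assumes integer-indexed actions
    time_step = stochastic_mdp.step(action)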
@staticmethod
def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
    @staticmethod
    def sample_parameters(n: int, seed: int = None) -> List[Dict[str, Any]]:
        return SimpleGridMDP.sample_mdp_parameters(n, False, seed)
Returns
  • List[Dict[str, Any]]: n sampled parameters that can be used to construct an MDP in a reasonable amount of time.
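
A sketch of how the sampled dictionaries can be used; it assumes their keys match the constructor arguments, which is what the gin-related helpers (get_gin_parameters, produce_gin_file_from_mdp_parameters) suggest:

from colosseum.mdp.simple_grid.infinite_horizon import SimpleGridContinuous

# Sample three parameter dictionaries and build the corresponding MDPs.
params = SimpleGridContinuous.sample_parameters(n=3, seed=0)
mdps = [SimpleGridContinuous(**p) for p in params]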
Inherited Members
colosseum.mdp.base_infinite.ContinuousMDP
is_episodic
get_grid_representation
colosseum.mdp.simple_grid.base.SimpleGridMDP
get_action_class
get_unique_symbols
does_seed_change_MDP_structure
sample_mdp_parameters
get_node_class
get_gin_parameters
n_actions
parameters
colosseum.mdp.base.BaseMDP
get_available_hardness_measures
produce_gin_file_from_mdp_parameters
get_gin_config
get_node_labels
get_node_action_labels
hash
instantiate_MDP
T
R
recurrent_nodes_set
communication_class
get_optimal_policy
get_worst_policy
get_value_functions
optimal_value_functions
worst_value_functions
random_value_functions
optimal_transition_probabilities
worst_transition_probabilities
random_transition_probabilities
optimal_markov_chain
worst_markov_chain
random_markov_chain
get_stationary_distribution
optimal_stationary_distribution
worst_stationary_distribution
random_stationary_distribution
optimal_average_rewards
worst_average_rewards
random_average_rewards
get_average_reward
optimal_average_reward
worst_average_reward
random_average_reward
transition_matrix_and_rewards
graph_layout
graph_metrics
diameter
sum_reciprocals_suboptimality_gaps
discounted_value_norm
undiscounted_value_norm
value_norm
measures_of_hardness
summary
hardness_report
get_info_class
get_transition_distributions
get_reward_distribution
sample_reward
get_measure_from_name
action_spec
observation_spec
get_observation
reset
step
random_steps
random_step
get_visitation_counts
reset_visitation_counts
get_value_node_labels
dm_env._environment.Environment
reward_spec
discount_spec
close