colosseum.experiment.indicators
from typing import Tuple

import numpy as np

from colosseum.dynamic_programming import episodic_policy_evaluation
from colosseum.dynamic_programming import episodic_value_iteration


def get_episodic_regret_at_time_zero(
    H: int,
    T: np.ndarray,
    R: np.ndarray,
    policy: np.ndarray,
    optimal_value: np.ndarray = None,
) -> np.ndarray:
    """
    Returns
    -------
    np.ndarray
        The regret for the states at in-episode time of zero.
    """
    assert T.ndim == 3, "We don't need the episodic transition matrix here."
    _, V = episodic_policy_evaluation(H, T, R, policy)
    if optimal_value is None:
        _, optimal_value = episodic_value_iteration(H, T, R)
    return optimal_value[0] - V[0]


def get_episodic_regrets_and_average_reward_at_time_zero(
    H, T, R, policy, starting_state_distribution, optimal_value: np.ndarray = None
) -> Tuple[np.ndarray, float]:
    """
    Returns
    -------
    np.ndarray
        The regret for the states at in-episode time of zero.
    float
        The average value at time zero
    """
    _, V = episodic_policy_evaluation(H, T, R, policy)
    episodic_agent_average_reward = sum(V[0] * starting_state_distribution)
    if optimal_value is None:
        _, optimal_value = episodic_value_iteration(H, T, R)
    regret_at_time_zero = np.maximum(optimal_value[0] - V[0], 0.0)
    return regret_at_time_zero, episodic_agent_average_reward
def get_episodic_regret_at_time_zero(H: int, T: numpy.ndarray, R: numpy.ndarray, policy: numpy.ndarray, optimal_value: numpy.ndarray = None) -> numpy.ndarray:
Returns
- np.ndarray: The regret for each state at in-episode time zero.
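A minimal usage sketch follows. The argument shapes are assumptions inferred from the source above rather than documented facts: T as an (S, A, S) transition kernel (the assert only requires T.ndim == 3), R as an (S, A) reward matrix, and policy as an (H, S, A) time-inhomogeneous policy.

import numpy as np

from colosseum.experiment.indicators import get_episodic_regret_at_time_zero

H = 3        # episode horizon
S, A = 2, 2  # number of states and actions

# Assumed transition kernel T[s, a, s']: action 0 always leads to state 0,
# action 1 always leads to state 1, regardless of the current state.
T = np.zeros((S, A, S))
T[:, 0, 0] = 1.0
T[:, 1, 1] = 1.0

# Assumed reward matrix R[s, a]: only state 1 is rewarding.
R = np.array([[0.0, 0.0],
              [1.0, 1.0]])

# Assumed policy layout pi[h, s, a]: uniformly random at every in-episode time.
policy = np.full((H, S, A), 1.0 / A)

# One regret value per state, measured at in-episode time zero.
regret = get_episodic_regret_at_time_zero(H, T, R, policy)
print(regret)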
def get_episodic_regrets_and_average_reward_at_time_zero(H, T, R, policy, starting_state_distribution, optimal_value: numpy.ndarray = None) -> Tuple[numpy.ndarray, float]:
Returns
- np.ndarray: The regret for each state at in-episode time zero, clipped to be non-negative.
- float: The policy's value at time zero, averaged over the starting state distribution.
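The sketch below shows how the pair of outputs might be consumed, under the same shape assumptions as above and with an illustrative uniform starting-state distribution. Passing a precomputed optimal_value skips the internal call to episodic_value_iteration, which is convenient when several policies are evaluated on the same MDP.

import numpy as np

from colosseum.dynamic_programming import episodic_value_iteration
from colosseum.experiment.indicators import (
    get_episodic_regrets_and_average_reward_at_time_zero,
)

H, S, A = 3, 2, 2

# Same toy MDP as in the previous sketch (assumed shapes).
T = np.zeros((S, A, S))
T[:, 0, 0] = 1.0
T[:, 1, 1] = 1.0
R = np.array([[0.0, 0.0],
              [1.0, 1.0]])
policy = np.full((H, S, A), 1.0 / A)

# Illustrative uniform distribution over starting states.
starting_state_distribution = np.full(S, 1.0 / S)

# Optional: compute the optimal value once and reuse it across policies.
_, optimal_value = episodic_value_iteration(H, T, R)

regrets, average_reward = get_episodic_regrets_and_average_reward_at_time_zero(
    H, T, R, policy, starting_state_distribution, optimal_value
)

# regrets: per-state regret at in-episode time zero, clipped at zero.
# average_reward: the policy's value at time zero weighted by the
# starting-state distribution.
print(regrets, average_reward)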