"""Utilities for evaluating saved PPO models and searching for favorable seeds.

A "result" throughout this module is ``mean_reward - std_reward`` over a set of
evaluation episodes: a pessimistic score that rewards both high and stable returns.
"""

import random

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor


def eval_model_with_seed(model_fp, env_id, seed, n_eval_episodes=10, n_envs=1):
    """Evaluate a saved model on a deterministically seeded vectorized env.

    Args:
        model_fp: Path to a saved PPO model (as accepted by ``PPO.load``).
        env_id: Gymnasium environment id, e.g. ``"LunarLander-v2"``.
        seed: Seed passed to ``make_vec_env`` so the evaluation is reproducible.
        n_eval_episodes: Episodes to average over.
        n_envs: Number of parallel environments in the vec env.

    Returns:
        Tuple ``(result, mean_reward, std_reward)`` where
        ``result = mean_reward - std_reward``.
    """
    eval_env = make_vec_env(env_id, seed=seed, n_envs=n_envs)
    return eval_model(model_fp, eval_env, n_eval_episodes)


def eval_model_random(model_fp, env_id, n_eval_episodes=10):
    """Evaluate a saved model on a single unseeded (randomly seeded) env.

    The env is wrapped in ``Monitor`` so ``evaluate_policy`` sees true episode
    returns even if the env uses reward wrappers.

    Returns:
        Tuple ``(result, mean_reward, std_reward)``.
    """
    eval_env = Monitor(gym.make(env_id))
    return eval_model(model_fp, eval_env, n_eval_episodes)


def eval_model_random_with_average(
    model_fp, env_id, n_eval_episodes=10, n_average=10, verbose=False
):
    """Run ``eval_model_random`` ``n_average`` times and average the outputs.

    Useful to estimate a model's typical score independent of any one seed.

    Returns:
        Tuple of averaged ``(result, mean_reward, std_reward)``.
    """
    result_sum = 0
    mean_reward_sum = 0
    std_reward_sum = 0
    for i in range(n_average):
        if verbose and i % 100 == 0:
            print(f"Progress: {i}/{n_average}")
        result, mean_reward, std_reward = eval_model_random(
            model_fp, env_id, n_eval_episodes
        )
        result_sum += result
        mean_reward_sum += mean_reward
        std_reward_sum += std_reward
    return (
        result_sum / n_average,
        mean_reward_sum / n_average,
        std_reward_sum / n_average,
    )


def eval_model(model_fp, eval_env, n_eval_episodes=10):
    """Load a PPO model from disk and evaluate it deterministically on ``eval_env``.

    Note: the model is reloaded on every call; callers evaluating many seeds pay
    the load cost each time (kept for API compatibility).

    Returns:
        Tuple ``(result, mean_reward, std_reward)`` with
        ``result = mean_reward - std_reward``.
    """
    model = PPO.load(model_fp, env=eval_env)
    mean_reward, std_reward = evaluate_policy(
        model, eval_env, n_eval_episodes=n_eval_episodes, deterministic=True
    )
    result = mean_reward - std_reward
    return result, mean_reward, std_reward


def search_for_best_seed(
    model_fp,
    env_id,
    n_eval_episodes=10,
    n_total_envs_to_search=1000,
    max_n_envs=16,
    verbose=False,
):
    """Randomly sample (seed, n_envs) pairs and keep the best-scoring one.

    Args:
        model_fp: Path to a saved PPO model.
        env_id: Gymnasium environment id.
        n_eval_episodes: Episodes per candidate evaluation.
        n_total_envs_to_search: Number of random candidates to try.
        max_n_envs: Upper bound (inclusive) for the sampled ``n_envs``.
        verbose: Print progress every 100 candidates.

    Returns:
        Tuple ``(best_result, best_seed, best_n_envs)``.
    """
    # Start at -inf so the first candidate always becomes the incumbent; a 0
    # start would silently return (0, 0, 0) whenever every result is <= 0
    # (common in environments with negative rewards).
    best_result = float("-inf")
    best_seed = 0
    best_n_envs = 0
    for i in range(n_total_envs_to_search):
        if verbose and i % 100 == 0:
            print(f"Progress: {i}/{n_total_envs_to_search}")
        seed = random.randint(0, 1000000000000)
        n_envs = random.randint(1, max_n_envs)
        result, _, _ = eval_model_with_seed(
            model_fp, env_id, seed, n_eval_episodes, n_envs
        )
        if result > best_result:
            best_result = result
            best_seed = seed
            best_n_envs = n_envs
    return best_result, best_seed, best_n_envs


def search_for_best_seed_in_range(
    model_fp,
    env_id,
    range=range(0, 1000),
    n_eval_episodes=10,
    n_envs_options=(1, 2, 4, 8, 16, 32),
):
    """Exhaustively score every (seed, n_envs) pair over a seed range.

    Prints each new incumbent as it is found and a final summary line.

    Args:
        model_fp: Path to a saved PPO model.
        env_id: Gymnasium environment id.
        range: Iterable of seeds to try. NOTE(review): this parameter shadows
            the ``range`` builtin; the name is kept so existing keyword callers
            (``range=...``) keep working, and it is aliased immediately below.
        n_eval_episodes: Episodes per candidate evaluation (was hard-coded 10).
        n_envs_options: Candidate env counts to try per seed (was hard-coded).

    Returns:
        Tuple ``(best_result, best_seed, best_n_envs)``.
    """
    seed_range = range  # alias away the shadowed builtin
    # -inf start: ensures a real incumbent is chosen even when all results <= 0.
    best_result = float("-inf")
    best_seed = 0
    best_n_envs = 0
    for seed in seed_range:
        for n_envs in n_envs_options:
            result, _, _ = eval_model_with_seed(
                model_fp, env_id, seed, n_eval_episodes, n_envs
            )
            if result > best_result:
                best_result = result
                best_seed = seed
                best_n_envs = n_envs
                print(best_result, seed, n_envs)
    print(best_result, best_seed, best_n_envs)
    return best_result, best_seed, best_n_envs