import argparse
import os
import matplotlib.pyplot as plt
import json
import time
import numpy as np
import torch
from pathlib import Path
from utils.babyai_utils.baby_agent import load_agent
from utils.storage import get_status
from utils.env import make_env
from utils.other import seed
from utils.storage import get_model_dir
from models import *
from utils.env import env_args_str_to_dict
import gym
from termcolor import cprint

os.makedirs("./evaluation", exist_ok=True)

start = time.time()

# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--test-set-seed", type=int, default=0,
                    help="random seed (default: 0)")
parser.add_argument("--random-agent", action="store_true", default=False,
                    help="random actions")
parser.add_argument("--quiet", "-q", action="store_true", default=False,
                    help="quiet")
parser.add_argument("--eval-env", type=str, default=None,
                    help="env to evaluate on")
parser.add_argument("--model-to-evaluate", type=str, default=None,
                    help="model to evaluate")
parser.add_argument("--model-label", type=str, default=None,
                    help="label to use for the evaluated model")
parser.add_argument("--max-steps", type=int, default=None,
                    help="max number of steps per episode")
parser.add_argument("--argmax", action="store_true", default=False,
                    help="select the action with highest probability (default: False)")
parser.add_argument("--episodes", type=int, default=1000,
                    help="number of episodes to test")
parser.add_argument("--test-p", type=float, default=0.05,
                    help="p-value threshold for the significance tests")
parser.add_argument("--n-seeds", type=int, default=8,
                    help="number of training seeds to evaluate")
parser.add_argument("--subsample-step", type=int, default=1,
                    help="subsampling step for checkpoint curves")
parser.add_argument("--start-step", type=int, default=1,
                    help="at which step to start the curves")
parser.add_argument("--env_args", nargs='*', default=None)
args = parser.parse_args()

# Set seed for all randomness sources
seed(args.test_set_seed)

assert args.test_set_seed == 1  # turn on for testing
# assert not args.argmax
# assert args.num_frames == 28000000
# assert args.episodes == 1000

test_p = args.test_p
n_seeds = args.n_seeds
assert n_seeds in [16, 8, 4]
cprint("n seeds: {}".format(n_seeds), "red")

subsample_step = args.subsample_step
start_step = args.start_step


def qprint(*a, **kwargs):
    if not args.quiet:
        print(*a, **kwargs)


# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qprint(f"Device: {device}\n")

# what to load
if args.model_to_evaluate is None:
    models_to_evaluate = [
        "19-05_500K_HELP_env_MiniGrid-Exiter-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2"
    ]
    label_parser_dict = {
        "19-05_500K_HELP_env_MiniGrid-Exiter-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2": "Exiter_EB",
    }
else:
    model_name = args.model_to_evaluate.replace("./storage/", "").replace("storage/", "")
    models_to_evaluate = [
        model_name
    ]
    if args.model_label:
        label_parser_dict = {
            model_name: args.model_label,
        }
    else:
        label_parser_dict = {
            model_name: model_name,
        }

qprint("evaluating models: ", models_to_evaluate)

# how to do stat tests
compare = {
    # "MH-BabyAI-ExpBonus": "Abl-MH-BabyAI-ExpBonus",
}

COLORS = ["red", "blue", "green", "black", "purple", "brown", "orange", "gray"]
label_color_dict = {l: c for l, c in zip(label_parser_dict.values(), COLORS)}

test_set_check_path = Path("test_set_check_{}_nep_{}.json".format(args.test_set_seed, args.episodes))


def calc_perf_for_seed(i, model_name, seed, argmax, episodes, random_agent=False, num_frames=None):
    qprint("seed {}".format(i))
    model = Path(model_name) / str(i)
    model_dir = get_model_dir(model)

    if test_set_check_path.exists():
        with open(test_set_check_path, "r") as f:
            check_loaded = json.load(f)
        qprint("check loaded")
    else:
        qprint("check not loaded")
        check_loaded = None

    # Load environment
    with open(model_dir + "/config.json") as f:
        conf = json.load(f)

    if args.eval_env is None:
        qprint("evaluating on the original env")
        env_name = conf["env"]
    else:
        qprint("evaluating on a different env")
        env_name = args.eval_env

    env = gym.make(env_name, **env_args_str_to_dict(args.env_args))
    qprint("Environment loaded\n")

    # load agent
    agent = load_agent(env, model_dir, argmax)
    status = get_status(model_dir)
    qprint("Agent loaded at {} steps.".format(status.get("num_frames", -1)))

    check = {}
    seed_rewards = []
    seed_sr = []
    for episode in range(episodes):
        qprint("[{}/{}]: ".format(episode, episodes), end="", flush=True)
        obs = env.reset()

        # check that envs are the same across seeds
        if episode in check:
            assert check[episode] == int(obs['image'].sum())
        else:
            check[episode] = int(obs['image'].sum())

        if check_loaded is not None:
            # compare against the saved reference (JSON keys are strings)
            assert check_loaded[str(episode)] == int(obs['image'].sum())

        i = 0
        tot_reward = 0
        while True:
            i += 1
            if random_agent:
                action = agent.get_random_action(obs)
            else:
                action = agent.get_action(obs)

            obs, reward, done, info = env.step(action)

            if reward:
                qprint("*", end="", flush=True)
            else:
                qprint(".", end="", flush=True)

            agent.analyze_feedback(reward, done)
            tot_reward += reward

            if done:
                seed_rewards.append(tot_reward)
                seed_sr.append(info["success"])
                break

            if args.max_steps is not None:
                if i > args.max_steps:
                    seed_rewards.append(tot_reward)
                    seed_sr.append(info["success"])
                    break
        qprint()

    seed_rewards = np.array(seed_rewards)
    seed_success_rates = np.array(seed_sr)

    if not test_set_check_path.exists():
        with open(test_set_check_path, "w") as f:
            json.dump(check, f)
        qprint("check saved")

    qprint("seed success rate:", seed_success_rates.mean())
    qprint("seed reward:", seed_rewards.mean())

    return seed_rewards.mean(), seed_success_rates.mean()


def get_available_steps(model):
    model_dir = Path(get_model_dir(model))
    per_seed_available_steps = {}
    for seed_dir in model_dir.glob("*"):
        per_seed_available_steps[seed_dir] = sorted([
            int(str(p.with_suffix("")).split("status_")[-1]) for p in seed_dir.glob("status_*")
        ])

    num_steps = min([len(steps) for steps in per_seed_available_steps.values()])
    steps = list(per_seed_available_steps.values())[0][:num_steps]

    for available_steps in per_seed_available_steps.values():
        s_steps = available_steps[:num_steps]
        assert steps == s_steps

    return steps


def plot_with_shade(subplot_nb, ax, x, y, err, color, shade_color, label,
                    legend=False, leg_size=30, leg_loc='best', title=None,
                    ylim=[0, 100], xlim=[0, 40], leg_args={}, leg_linewidth=8.0,
                    linewidth=7.0, ticksize=30, zorder=None,
                    xlabel='perf', ylabel='env steps', smooth_factor=1000):
    # plt.rcParams.update({'font.size': 15})
    ax.locator_params(axis='x', nbins=6)
    ax.locator_params(axis='y', nbins=5)
    ax.tick_params(axis='both', which='major', labelsize=ticksize)

    # smoothing
    def smooth(x_, n=50):
        return np.array([x_[max(i - n, 0):i + 1].mean() for i in range(len(x_))])

    if smooth_factor > 0:
        y = smooth(y, n=smooth_factor)
        err = smooth(err, n=smooth_factor)

    ax.plot(x, y, color=color, label=label, linewidth=linewidth, zorder=zorder)
    ax.fill_between(x, y - err, y + err, color=shade_color, alpha=0.2)

    if legend:
        leg = ax.legend(loc=leg_loc, fontsize=leg_size, **leg_args)  # 34
        for legobj in leg.legendHandles:
            legobj.set_linewidth(leg_linewidth)

    ax.set_xlabel(xlabel, fontsize=30)
    if subplot_nb == 0:
        ax.set_ylabel(ylabel, fontsize=30)

    ax.set_xlim(xmin=xlim[0], xmax=xlim[1])
    ax.set_ylim(bottom=ylim[0], top=ylim[1])
    if title:
        ax.set_title(title, fontsize=22)
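# A minimal standalone sketch of how plot_with_shade can be exercised in isolation
# (synthetic data and hypothetical placeholder names, for illustration only; the real
# call is made in the curve-plotting branch further below):
#
#   _fig, _axis = plt.subplots(1, 1, figsize=(10.0, 6.0))
#   _x = np.arange(100)
#   _y = np.linspace(0.0, 1.0, 100)
#   _err = np.full(100, 0.05)
#   plot_with_shade(0, _axis, _x, _y, _err, "red", "red", "example",
#                   legend=True, xlim=[0, 100], ylim=[0.0, 1.0],
#                   xlabel="Env steps", ylabel="performance", smooth_factor=10)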
def label_parser(label, label_parser_dict):
    # a model name must match exactly one entry in label_parser_dict
    if sum([1 for k, v in label_parser_dict.items() if k in label]) != 1:
        qprint("ERROR")
        qprint(label)
        exit()

    for k, v in label_parser_dict.items():
        if k in label:
            return v

    return label


f, ax = plt.subplots(1, 1, figsize=(10.0, 6.0))
ax = [ax]

performances = {}
per_seed_performances = {}
stds = {}

label_parser_dict_reverse = {v: k for k, v in label_parser_dict.items()}
assert len(label_parser_dict_reverse) == len(label_parser_dict)

label_to_model = {}

# evaluate and draw curves
for model in models_to_evaluate:
    label = label_parser(model, label_parser_dict)
    label_to_model[label] = model
    color = label_color_dict[label]

    performances[label] = []
    per_seed_performances[label] = []
    stds[label] = []

    final_perf = True
    if final_perf:
        # evaluate only the final checkpoint of each training seed
        results = []
        for s in range(n_seeds):
            results.append(calc_perf_for_seed(
                s,
                model_name=model,
                num_frames=None,
                seed=args.test_set_seed,
                argmax=args.argmax,
                episodes=args.episodes,
            ))

        rewards, success_rates = zip(*results)

        # dump per seed performance
        np.save("./evaluation/{}".format(label), success_rates)

        rewards = np.array(rewards)
        success_rates = np.array(success_rates)

        success_rate_mean = success_rates.mean()
        success_rate_std = success_rates.std()

        label = label_parser(str(model), label_parser_dict)
        cprint("{}: {} +- std {}".format(label, success_rate_mean, success_rate_std), "red")

    else:
        # evaluate every available checkpoint and draw a training curve
        steps = get_available_steps(model)
        steps = steps[::subsample_step]
        steps = [s for s in steps if s > start_step]
        qprint("steps:", steps)

        for step in steps:
            results = []
            for s in range(n_seeds):
                results.append(calc_perf_for_seed(
                    s,
                    model_name=model,
                    num_frames=step,
                    seed=args.test_set_seed,
                    argmax=args.argmax,
                    episodes=args.episodes,
                ))

            rewards, success_rates = zip(*results)
            rewards = np.array(rewards)
            success_rates = np.array(success_rates)

            per_seed_performances[label].append(success_rates)
            performances[label].append(success_rates.mean())
            stds[label].append(success_rates.std())

        means = np.array(performances[label])
        err = np.array(stds[label])

        label = label_parser(str(model), label_parser_dict)
        max_steps = np.max(steps)
        min_steps = np.min(steps)
        min_y = 0.0
        max_y = 1.0
        ylabel = "performance"
        smooth_factor = 0

        plot_with_shade(0, ax[0], steps, means, err, color, color, label,
                        legend=True, xlim=[min_steps, max_steps], ylim=[min_y, max_y],
                        leg_size=20, xlabel="Env steps (millions)", ylabel=ylabel,
                        linewidth=5.0, smooth_factor=smooth_factor)

assert len(label_to_model) == len(models_to_evaluate)


def get_compatible_steps(model1, model2, subsample_step):
    steps_1 = get_available_steps(model1)[::subsample_step]
    steps_2 = get_available_steps(model2)[::subsample_step]

    min_steps = min(len(steps_1), len(steps_2))
    steps_1 = steps_1[:min_steps]
    steps_2 = steps_2[:min_steps]

    assert steps_1 == steps_2
    return steps_1
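# Note: the commented-out significance tests below use Welch's t-test via stats.ttest_ind
# (equal_var=False), which would additionally require `from scipy import stats`; they also
# assume the per-checkpoint branch above (final_perf = False) was run, so that
# per_seed_performances is populated for the labels listed in `compare`.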
# # stat tests
# for k, v in compare.items():
#     dist_1_steps = per_seed_performances[k]
#     dist_2_steps = per_seed_performances[v]
#
#     model_k = label_to_model[k]
#     model_v = label_to_model[v]
#     steps = get_compatible_steps(model_k, model_v, subsample_step)
#     steps = [s for s in steps if s > start_step]
#
#     for step, dist_1, dist_2 in zip(steps, dist_1_steps, dist_2_steps):
#         assert len(dist_1) == n_seeds
#         assert len(dist_2) == n_seeds
#
#         p = stats.ttest_ind(
#             dist_1,
#             dist_2,
#             equal_var=False
#         ).pvalue
#
#         if np.isnan(p):
#             from IPython import embed; embed()
#
#         if p < test_p:
#             plt.scatter(step, 0.8, color=label_color_dict[k], s=50, marker="x")
#
#         print("{} (m:{}) <---> {} (m:{}) = p: {} result: {}".format(
#             k, np.mean(dist_1), v, np.mean(dist_2), p,
#             "Distributions different (p={})".format(test_p) if p < test_p else "Distributions same (p={})".format(test_p)
#         ))
#     print()
#
# f.savefig('graphics/test.png')
# f.savefig('graphics/test.svg')
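# Example invocation (a sketch; the script filename, model directory, and label are
# hypothetical placeholders, and the script asserts --test-set-seed 1 and --n-seeds in {4, 8, 16}):
#
#   python evaluate.py --test-set-seed 1 --episodes 1000 --n-seeds 8 --argmax \
#       --model-to-evaluate storage/<model_name> --model-label MyLabel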