# SocialAISchool/scripts/evaluate_new.py
import argparse
import os
import matplotlib.pyplot as plt
import json
import time
import numpy as np
import torch
from pathlib import Path
from utils.babyai_utils.baby_agent import load_agent
from utils.storage import get_status
from utils.env import make_env
from utils.other import seed
from utils.storage import get_model_dir
from models import *
from utils.env import env_args_str_to_dict
import gym
from termcolor import cprint
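# Evaluates trained agents on a fixed test set of episodes and reports the mean
# success rate (and reward) over several training seeds. It can either evaluate
# the final checkpoint of each seed, or evaluate every saved checkpoint and plot
# a success-rate curve over training steps.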
os.makedirs("./evaluation", exist_ok=True)
start = time.time()
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--test-set-seed", type=int, default=0,
                    help="random seed (default: 0)")
parser.add_argument("--random-agent", action="store_true", default=False,
                    help="use random actions instead of the trained policy")
parser.add_argument("--quiet", "-q", action="store_true", default=False,
                    help="suppress per-episode output")
parser.add_argument("--eval-env", type=str, default=None,
                    help="env to evaluate on (defaults to the env the model was trained on)")
parser.add_argument("--model-to-evaluate", type=str, default=None,
                    help="model to evaluate")
parser.add_argument("--model-label", type=str, default=None,
                    help="label used for the model in plots and saved results")
parser.add_argument("--max-steps", type=int, default=None,
                    help="max number of steps per episode")
parser.add_argument("--argmax", action="store_true", default=False,
                    help="select the action with highest probability (default: False)")
parser.add_argument("--episodes", type=int, default=1000,
                    help="number of episodes to test")
parser.add_argument("--test-p", type=float, default=0.05,
                    help="p-value threshold for the significance tests")
parser.add_argument("--n-seeds", type=int, default=8,
                    help="number of training seeds to evaluate")
parser.add_argument("--subsample-step", type=int, default=1,
                    help="subsample step for checkpoint curves")
parser.add_argument("--start-step", type=int, default=1,
                    help="at which step to start the curves")
parser.add_argument("--env_args", nargs='*', default=None)
args = parser.parse_args()
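# Example invocation (illustrative only; <RUN_NAME> is a placeholder for a run
# directory under ./storage/ that contains one subdirectory per training seed):
#   python scripts/evaluate_new.py \
#       --model-to-evaluate storage/<RUN_NAME> --model-label MyRun \
#       --test-set-seed 1 --episodes 1000 --n-seeds 8 --argmax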
# Set seed for all randomness sources
seed(args.test_set_seed)
# sanity checks used for the final test-set evaluation
assert args.test_set_seed == 1
# assert not args.argmax
# assert args.num_frames == 28000000
# assert args.episodes == 1000
test_p = args.test_p
n_seeds = args.n_seeds
assert n_seeds in [16, 8, 4]
cprint("n seeds: {}".format(n_seeds), "red")
subsample_step = args.subsample_step
start_step = args.start_step
def qprint(*a, **kwargs):
    if not args.quiet:
        print(*a, **kwargs)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qprint(f"Device: {device}\n")
# what to load
if args.model_to_evaluate is None:
    models_to_evaluate = [
        "19-05_500K_HELP_env_MiniGrid-Exiter-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2"
    ]
    label_parser_dict = {
        "19-05_500K_HELP_env_MiniGrid-Exiter-8x8-v0_multi-modal-babyai11-agent_arch_original_endpool_res_custom-ppo-2": "Exiter_EB",
    }
else:
    model_name = args.model_to_evaluate.replace("./storage/", "").replace("storage/", "")
    models_to_evaluate = [
        model_name
    ]
    if args.model_label:
        label_parser_dict = {
            model_name: args.model_label,
        }
    else:
        label_parser_dict = {
            model_name: model_name,
        }
qprint("evaluating models: ", models_to_evaluate)
# how do to stat tests
compare = {
    # "MH-BabyAI-ExpBonus": "Abl-MH-BabyAI-ExpBonus",
}
COLORS = ["red", "blue", "green", "black", "purple", "brown", "orange", "gray"]
label_color_dict = {l: c for l, c in zip(label_parser_dict.values(), COLORS)}
test_set_check_path = Path("test_set_check_{}_nep_{}.json".format(args.test_set_seed, args.episodes))
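# The check file stores a checksum (sum of the first observation image) for every
# test episode. It is written on the first run and compared against on later runs,
# so that all models and seeds are evaluated on exactly the same set of episodes.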
def calc_perf_for_seed(i, model_name, seed, argmax, episodes, random_agent=False, num_frames=None):
    qprint("seed {}".format(i))
    model = Path(model_name) / str(i)
    model_dir = get_model_dir(model)

    if test_set_check_path.exists():
        with open(test_set_check_path, "r") as f:
            check_loaded = json.load(f)
        qprint("check loaded")
    else:
        qprint("check not loaded")
        check_loaded = None

    # Load environment
    with open(model_dir + "/config.json") as f:
        conf = json.load(f)

    if args.eval_env is None:
        qprint("evaluating on the original env")
        env_name = conf["env"]
    else:
        qprint("evaluating on a different env")
        env_name = args.eval_env

    env = gym.make(env_name, **env_args_str_to_dict(args.env_args))
    qprint("Environment loaded\n")

    # load agent
    agent = load_agent(env, model_dir, argmax)
    status = get_status(model_dir)
    qprint("Agent loaded at {} steps.".format(status.get("num_frames", -1)))

    check = {}
    seed_rewards = []
    seed_sr = []

    for episode in range(episodes):
        qprint("[{}/{}]: ".format(episode, episodes), end="", flush=True)
        obs = env.reset()

        # check that the test episodes are identical across seeds and runs
        if episode in check:
            assert check[episode] == int(obs['image'].sum())
        else:
            check[episode] = int(obs['image'].sum())

        if check_loaded is not None:
            # compare against the checksums saved by a previous run (json keys are strings)
            assert check_loaded[str(episode)] == int(obs['image'].sum())

        i = 0
        tot_reward = 0
        while True:
            i += 1
            if random_agent:
                action = agent.get_random_action(obs)
            else:
                action = agent.get_action(obs)
            obs, reward, done, info = env.step(action)
            if reward:
                qprint("*", end="", flush=True)
            else:
                qprint(".", end="", flush=True)
            agent.analyze_feedback(reward, done)
            tot_reward += reward
            if done:
                seed_rewards.append(tot_reward)
                seed_sr.append(info["success"])
                break
            if args.max_steps is not None and i > args.max_steps:
                seed_rewards.append(tot_reward)
                seed_sr.append(info["success"])
                break
        qprint()

    seed_rewards = np.array(seed_rewards)
    seed_success_rates = np.array(seed_sr)

    if not test_set_check_path.exists():
        with open(test_set_check_path, "w") as f:
            json.dump(check, f)
        qprint("check saved")

    qprint("seed success rate:", seed_success_rates.mean())
    qprint("seed reward:", seed_rewards.mean())

    return seed_rewards.mean(), seed_success_rates.mean()
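# Collects the checkpoint steps (from the "status_<step>" files) available for
# every training seed of a model; used when plotting curves over training time.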
def get_available_steps(model):
    model_dir = Path(get_model_dir(model))
    per_seed_available_steps = {}
    for seed_dir in model_dir.glob("*"):
        per_seed_available_steps[seed_dir] = sorted([
            int(str(p.with_suffix("")).split("status_")[-1])
            for p in seed_dir.glob("status_*")
        ])
    num_steps = min([len(steps) for steps in per_seed_available_steps.values()])
    steps = list(per_seed_available_steps.values())[0][:num_steps]
    # all seeds must have checkpoints at the same steps
    for available_steps in per_seed_available_steps.values():
        s_steps = available_steps[:num_steps]
        assert steps == s_steps
    return steps
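# Plots a mean curve with a shaded +- err band on the given axis; optionally
# applies a running-mean smoothing to both the curve and the band.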
def plot_with_shade(subplot_nb, ax, x, y, err, color, shade_color, label,
                    legend=False, leg_size=30, leg_loc='best', title=None,
                    ylim=[0, 100], xlim=[0, 40], leg_args={}, leg_linewidth=8.0, linewidth=7.0, ticksize=30,
                    zorder=None, xlabel='env steps', ylabel='perf', smooth_factor=1000):
    # plt.rcParams.update({'font.size': 15})
    ax.locator_params(axis='x', nbins=6)
    ax.locator_params(axis='y', nbins=5)
    ax.tick_params(axis='both', which='major', labelsize=ticksize)

    # smoothing: running mean over the last n points
    def smooth(x_, n=50):
        return np.array([x_[max(i - n, 0):i + 1].mean() for i in range(len(x_))])

    if smooth_factor > 0:
        y = smooth(y, n=smooth_factor)
        err = smooth(err, n=smooth_factor)

    ax.plot(x, y, color=color, label=label, linewidth=linewidth, zorder=zorder)
    ax.fill_between(x, y - err, y + err, color=shade_color, alpha=0.2)
    if legend:
        leg = ax.legend(loc=leg_loc, fontsize=leg_size, **leg_args)  # 34
        for legobj in leg.legendHandles:
            legobj.set_linewidth(leg_linewidth)
    ax.set_xlabel(xlabel, fontsize=30)
    if subplot_nb == 0:
        ax.set_ylabel(ylabel, fontsize=30)
    ax.set_xlim(xmin=xlim[0], xmax=xlim[1])
    ax.set_ylim(bottom=ylim[0], top=ylim[1])
    if title:
        ax.set_title(title, fontsize=22)
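# Maps a full model directory name to a short plot label via label_parser_dict;
# exits if the name matches zero or multiple entries.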
def label_parser(label, label_parser_dict):
    if sum([1 for k, v in label_parser_dict.items() if k in label]) != 1:
        qprint("ERROR")
        qprint(label)
        exit()
    for k, v in label_parser_dict.items():
        if k in label:
            return v
    return label
f, ax = plt.subplots(1, 1, figsize=(10.0, 6.0))
ax = [ax]
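# Single-panel figure; ax is wrapped in a list so the plotting code below can
# index subplots uniformly via ax[0].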
performances = {}
per_seed_performances = {}
stds = {}
label_parser_dict_reverse = {v: k for k, v in label_parser_dict.items()}
assert len(label_parser_dict_reverse) == len(label_parser_dict)
label_to_model = {}
# evaluate and draw curves
for model in models_to_evaluate:
    label = label_parser(model, label_parser_dict)
    label_to_model[label] = model
    color = label_color_dict[label]
    performances[label] = []
    per_seed_performances[label] = []
    stds[label] = []

    final_perf = True  # True: evaluate only final checkpoints; False: evaluate all checkpoints and plot curves
    if final_perf:
        results = []
        for s in range(n_seeds):
            results.append(calc_perf_for_seed(
                s,
                model_name=model,
                num_frames=None,
                seed=args.test_set_seed,
                argmax=args.argmax,
                episodes=args.episodes,
            ))

        rewards, success_rates = zip(*results)

        # dump per seed performance
        np.save("./evaluation/{}".format(label), success_rates)

        rewards = np.array(rewards)
        success_rates = np.array(success_rates)
        success_rate_mean = success_rates.mean()
        success_rate_std = success_rates.std()
        label = label_parser(str(model), label_parser_dict)
        cprint("{}: {} +- std {}".format(label, success_rate_mean, success_rate_std), "red")

    else:
        steps = get_available_steps(model)
        steps = steps[::subsample_step]
        steps = [s for s in steps if s > start_step]
        qprint("steps:", steps)

        for step in steps:
            results = []
            for s in range(n_seeds):
                results.append(calc_perf_for_seed(
                    s,
                    model_name=model,
                    num_frames=step,
                    seed=args.test_set_seed,
                    argmax=args.argmax,
                    episodes=args.episodes,
                ))
            rewards, success_rates = zip(*results)
            rewards = np.array(rewards)
            success_rates = np.array(success_rates)
            per_seed_performances[label].append(success_rates)
            performances[label].append(success_rates.mean())
            stds[label].append(success_rates.std())

        means = np.array(performances[label])
        err = np.array(stds[label])
        label = label_parser(str(model), label_parser_dict)

        max_steps = np.max(steps)
        min_steps = np.min(steps)
        min_y = 0.0
        max_y = 1.0
        ylabel = "performance"
        smooth_factor = 0

        plot_with_shade(0, ax[0], steps, means, err, color, color, label,
                        legend=True, xlim=[min_steps, max_steps], ylim=[min_y, max_y],
                        leg_size=20, xlabel="Env steps (millions)", ylabel=ylabel, linewidth=5.0, smooth_factor=smooth_factor)
assert len(label_to_model) == len(models_to_evaluate)
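# Returns the checkpoint steps that two models have in common; used by the
# commented-out significance tests below.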
def get_compatible_steps(model1, model2, subsample_step):
    steps_1 = get_available_steps(model1)[::subsample_step]
    steps_2 = get_available_steps(model2)[::subsample_step]
    min_steps = min(len(steps_1), len(steps_2))
    steps_1 = steps_1[:min_steps]
    steps_2 = steps_2[:min_steps]
    assert steps_1 == steps_2
    return steps_1
# # stat tests
# for k, v in compare.items():
#     dist_1_steps = per_seed_performances[k]
#     dist_2_steps = per_seed_performances[v]
#
#     model_k = label_to_model[k]
#     model_v = label_to_model[v]
#     steps = get_compatible_steps(model_k, model_v, subsample_step)
#     steps = [s for s in steps if s > start_step]
#
#     for step, dist_1, dist_2 in zip(steps, dist_1_steps, dist_2_steps):
#         assert len(dist_1) == n_seeds
#         assert len(dist_2) == n_seeds
#
#         p = stats.ttest_ind(
#             dist_1,
#             dist_2,
#             equal_var=False
#         ).pvalue
#
#         if np.isnan(p):
#             from IPython import embed; embed()
#
#         if p < test_p:
#             plt.scatter(step, 0.8, color=label_color_dict[k], s=50, marker="x")
#
#         print("{} (m:{}) <---> {} (m:{}) = p: {} result: {}".format(
#             k, np.mean(dist_1), v, np.mean(dist_2), p,
#             "Distributions different(p={})".format(test_p) if p < test_p else "Distributions same(p={})".format(test_p)
#         ))
#     print()
#
# f.savefig('graphics/test.png')
# f.savefig('graphics/test.svg')
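# Per-seed success rates are written to ./evaluation/<label>.npy and the summary is
# printed via cprint. To also save the curve plot, uncomment the savefig calls above;
# note that they assume a graphics/ directory already exists.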