#!/usr/bin/env python import re import itertools import math from itertools import chain import time # import seaborn import numpy as np import os from collections import OrderedDict, defaultdict import pandas as pd import matplotlib.pyplot as plt import sys from termcolor import cprint, colored from pathlib import Path import pickle from scipy import stats save = True show_plot = False metrics = [ 'success_rate_mean', # 'FPS', # 'extrinsic_return_mean', # 'exploration_bonus_mean', # 'NPC_intro', # 'curriculum_param_mean', # 'curriculum_max_success_rate_mean', # 'rreturn_mean' ] eval_metric = "test_success_rates" # eval_metric = "exploration_bonus_mean" super_title = "" # super_title = "PPO - No exploration bonus" # super_title = "Count Based exploration bonus (Grid Search)" # super_title = "PPO + RND" # super_title = "PPO + RIDE" # statistical evaluation p-value test_p = 0.05 agg_title = "" color_dict = None eval_filename = None max_frames = 20_000_000 legend_show_n_seeds = False draw_legend = True per_seed = False study_train = False study_eval = True plot_test = True plot_aggregated_test = True plot_only_aggregated_test = True xnbins = 4 ynbins = 3 steps_denom = 1e6 # Global vas for tracking and labeling data at load time. exp_idx = 0 label_parser_dict = None label_parser = lambda l, _, label_parser_dict: l smooth_factor = 10 # used # smooth_factor = 0 print("smooth factor:", smooth_factor) eval_smooth_factor = None leg_size = 30 def smooth(x_, n=50): if n is None: return x_ if type(x_) == list: x_ = np.array(x_) return np.array([x_[max(i - n, 0):i + 1].mean() for i in range(len(x_))]) sort_test = False def sort_test_set(env_name): helps = [ "LanguageFeedback", "LanguageColor", "Pointing", "Emulation", ] problems = [ "Boxes", "Switches", "Generators", "Marble", "Doors", "Levers", ] env_names = [] for p in problems: for h in helps: env_names.append(h+p) env_names.extend([ "LeverDoorColl", "MarblePushColl", "MarblePassColl", "AppleStealing" ]) for i, en in enumerate(env_names): if en in env_name: return i raise ValueError(f"Test env {env_name} not known") subsample_step = 1 load_subsample_step = 1 x_lim = 0 max_x_lim = np.inf summary_dict = {} summary_dict_colors = {} to_plot_dict = {} default_colors_ = ["blue","orange","green","magenta", "brown", "red",'black',"grey",u'#ff7f0e', "cyan", "pink",'purple', u'#1f77b4', "darkorchid","sienna","lightpink", "indigo","mediumseagreen",'aqua', 'deeppink','silver','khaki','goldenrod'] * 100 def get_eval_data(logdir, eval_metric): eval_data = defaultdict(lambda :defaultdict(list)) for root, _, files in os.walk(logdir): for file in files: if 'testing_' in file: assert ".pkl" in file test_env_name = file.lstrip("testing_").rstrip(".pkl") try: with open(root+"/"+file, "rb") as f: seed_eval_data = pickle.load(f) except: print("Pickle not loaded: ", root+"/"+file) time.sleep(1) continue eval_data[test_env_name]["values"].append(seed_eval_data[eval_metric]) eval_data[test_env_name]["steps"].append(seed_eval_data["test_step_nb"]) for test_env, seed_data in eval_data.items(): min_len_seed = min([len(s) for s in seed_data['steps']]) eval_data[test_env]["values"] = np.array([s[:min_len_seed] for s in eval_data[test_env]["values"]]) eval_data[test_env]["steps"] = np.array([s[:min_len_seed] for s in eval_data[test_env]["steps"]]) return eval_data def get_all_runs(logdir, load_subsample_step=1): """ Recursively look through logdir for output files produced by Assumes that any file "log.csv" is a valid hit. """ global exp_idx global units datasets = [] for root, _, files in os.walk(logdir): if 'log.csv' in files: if (Path(root) / 'log.csv').stat().st_size == 0: print("CSV {} empty".format(os.path.join(root, 'log.csv'))) continue run_name = root[8:] exp_name = None config = None exp_idx += 1 # load progress data try: exp_data = pd.read_csv(os.path.join(root, 'log.csv')) print("Loaded:", os.path.join(root, 'log.csv')) except: raise ValueError("CSV {} faulty".format(os.path.join(root, 'log.csv'))) exp_data = exp_data[::load_subsample_step] data_dict = exp_data.to_dict("list") data_dict['config'] = config nb_epochs = len(data_dict['frames']) if nb_epochs == 1: print(f'{run_name} -> {colored(f"nb_epochs {nb_epochs}", "red")}') else: print('{} -> nb_epochs {}'.format(run_name, nb_epochs)) datasets.append(data_dict) return datasets def get_datasets(rootdir, load_only="", load_subsample_step=1, ignore_patterns=("ignore"), require_patterns=()): _, models_list, _ = next(os.walk(rootdir)) for dir_name in models_list.copy(): # add "ignore" in a directory name to avoid loading its content for ignore_pattern in ignore_patterns: if ignore_pattern in dir_name or load_only not in dir_name: if dir_name in models_list: models_list.remove(dir_name) if len(require_patterns) > 0: if not any([require_pattern in dir_name for require_pattern in require_patterns]): if dir_name in models_list: models_list.remove(dir_name) for expe_name in list(labels.keys()): if expe_name not in models_list: del labels[expe_name] # setting per-model type colors for i, m_name in enumerate(models_list): for m_type, m_color in per_model_colors.items(): if m_type in m_name: colors[m_name] = m_color print("extracting data for {}...".format(m_name)) m_id = m_name models_saves[m_id] = OrderedDict() models_saves[m_id]['data'] = get_all_runs(rootdir+m_name, load_subsample_step=load_subsample_step) print("done") if m_name not in labels: labels[m_name] = m_name model_eval_data[m_id] = get_eval_data(logdir=rootdir+m_name, eval_metric=eval_metric) """ retrieve all experiences located in "data to vizu" folder """ labels = OrderedDict() per_model_colors = OrderedDict() # LOAD DATA models_saves = OrderedDict() colors = OrderedDict() model_eval_data = OrderedDict() static_lines = {} ignore_patterns = ["_ignore_"] to_compare = None load_pattern = sys.argv[1] test_envs_to_plot = None # plot all min_y, max_y = 0.0, 1.1 def label_parser(label): label = label.replace("04-01_Pointing_CB_heldout_doors", "PPO_CB") label = label.replace("19-01_Color_CB_heldout_doors", "PPO_CBL") label = label.replace("19-01_Feedback_CB_heldout_doors_20M", "PPO_CBL") label = label.replace("20-01_JA_Color_CB_heldout_doors", "JA_PPO_CBL") label = label.replace("05-01_scaffolding_50M_no_acl", "PPO_no_scaf") label = label.replace("05-01_scaffolding_50M_acl_4_acl-type_intro_seq", "PPO_scaf_4") label = label.replace("05-01_scaffolding_50M_acl_8_acl-type_intro_seq_scaf", "PPO_scaf_8") label = label.replace("03-01_RR_ft_single_CB_marble_pass_A_soc_exp", "PPO_CB_role_B") label = label.replace("03-01_RR_ft_single_CB_marble_pass_A_asoc_contr", "PPO_CB_asocial") label = label.replace("05-01_RR_ft_group_50M_CB_marble_pass_A_soc_exp", "PPO_CB_role_B") label = label.replace("05-01_RR_ft_group_50M_CB_marble_pass_A_asoc_contr", "PPO_CB_asocial") label = label.replace("20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50", "PPO_CB_0.25") label = label.replace("20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50", "PPO_CB_0.5") label = label.replace("20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50", "PPO_CB_1") return label color_dict = { 'PPO_CB': "blue", 'PPO_CB(train)': "blue", "PPO_CB(test)": "orange", 'PPO_no_bonus': "orange", 'PPO_CBL': "blue", 'PPO_CBL(train)': "blue", "PPO_CBL(test)": "orange", 'JA_PPO_CBL': "green", "PPO_CB_role_B": "blue", "PPO_CB_asocial": "orange", 'PPO_CB_0.25': "blue", 'PPO_CB_0.5': "green", 'PPO_CB_1': "orange", } if load_pattern == "RR_single": save = False show_plot = True load_pattern = "_" plot_path = "../case_studies_final_figures/RR_dummy_single" require_patterns = [ "03-01_RR_ft_single_CB_marble_pass_A_asoc_contr", "03-01_RR_ft_single_CB_marble_pass_A_soc_exp", ] plot_aggregated_test = False plot_only_aggregated_test = False study_train = True study_eval = False elif load_pattern == "RR_group": load_pattern = "_" plot_path = "../case_studies_final_figures/RR_dummy_group" require_patterns = [ "05-01_RR_ft_group_50M_CB_marble_pass_A_asoc_contr", "05-01_RR_ft_group_50M_CB_marble_pass_A_soc_exp", ] plot_aggregated_test = False plot_only_aggregated_test = False study_train = True study_eval = False elif load_pattern == "scaffolding": load_pattern = "_" plot_path = "../case_studies_final_figures/Scaffolding_test" require_patterns = [ "05-01_scaffolding_50M_no_acl", "05-01_scaffolding_50M_acl_4_acl-type_intro_seq", "05-01_scaffolding_50M_acl_8_acl-type_intro_seq_scaf", ] test_envs_to_plot = None # aggregate all of them plot_aggregated_test = True plot_only_aggregated_test = True study_train = False study_eval = True to_compare = [ ("05-01_scaffolding_50M_acl_4_acl-type_intro_seq_agg_test", "05-01_scaffolding_50M_no_acl_agg_test", "auto_color"), ("05-01_scaffolding_50M_acl_8_acl-type_intro_seq_scaf_agg_test", "05-01_scaffolding_50M_no_acl_agg_test", "auto_color"), ] elif load_pattern == "pointing": study_train = True study_eval = True plot_aggregated_test = False plot_only_aggregated_test = False load_pattern = "_" test_envs_to_plot = [ "SocialAI-EPointingDoorsTestInformationSeekingParamEnv-v1", ] plot_path = "../case_studies_final_figures/Pointing_train_test" require_patterns = [ "04-01_Pointing_CB_heldout_doors", ] to_compare = [ ("04-01_Pointing_CB_heldout_doors", "04-01_Pointing_CB_heldout_doors_SocialAI-EPointingDoorsTestInformationSeekingParamEnv-v1", "black") ] elif load_pattern == "color": study_train = True study_eval = True plot_aggregated_test = False plot_only_aggregated_test = False max_x_lim = 18 load_pattern = "_" test_envs_to_plot = [ "SocialAI-ELangColorDoorsTestInformationSeekingParamEnv-v1", ] plot_path = "../case_studies_final_figures/Color_train_test" require_patterns = [ "19-01_Color_CB_heldout_doors", ] to_compare = [ ("19-01_Color_CB_heldout_doors", "19-01_Color_CB_heldout_doors_SocialAI-ELangColorDoorsTestInformationSeekingParamEnv-v1", "black") ] elif load_pattern == "ja_color": study_train = True study_eval = False plot_aggregated_test = False plot_only_aggregated_test = False max_x_lim = 18 load_pattern = "_" test_envs_to_plot = None plot_path = "../case_studies_final_figures/JA_Color_train" require_patterns = [ "19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", ] to_compare = [ ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black") ] elif load_pattern == "feedback_per_seed": study_train = True study_eval = False per_seed = True draw_legend = False plot_aggregated_test = False plot_only_aggregated_test = False max_x_lim = 18 load_pattern = "_" test_envs_to_plot = [ "SocialAI-ELangFeedbackDoorsTestInformationSeekingParamEnv-v1", ] plot_path = "../case_studies_final_figures/Feedback_train_per_seed" require_patterns = [ "19-01_Feedback_CB_heldout_doors", ] to_compare = None elif load_pattern == "feedback": study_train = True study_eval = True plot_aggregated_test = False plot_only_aggregated_test = False max_x_lim = 18 load_pattern = "_" test_envs_to_plot = [ "SocialAI-ELangFeedbackDoorsTestInformationSeekingParamEnv-v1", ] plot_path = "../case_studies_final_figures/Feedback_train_test" require_patterns = [ "19-01_Feedback_CB_heldout_doors", ] to_compare = [ ("19-01_Feedback_CB_heldout_doors_20M", "19-01_Feedback_CB_heldout_doors_20M_SocialAI-ELangFeedbackDoorsTestInformationSeekingParamEnv-v1", "black") ] elif load_pattern == "imitation_train": study_train = True study_eval = False plot_aggregated_test = False plot_only_aggregated_test = False max_x_lim = 18 load_pattern = "_" test_envs_to_plot = None plot_path = "../case_studies_final_figures/Imitation_train" require_patterns = [ "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50", "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50", "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50", ] # to_compare = [ # ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black") # ] to_compare = None elif load_pattern == "imitation_train_intro": metrics = ["NPC_intro"] show_plot = False save = True study_train = True study_eval = False plot_aggregated_test = False plot_only_aggregated_test = False max_x_lim = 18 load_pattern = "_" test_envs_to_plot = None plot_path = "../case_studies_final_figures/Imitation_train_intro" require_patterns = [ "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50", "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50", "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50", ] # to_compare = [ # ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black") # ] to_compare = None elif load_pattern == "imitation_test": study_train = False study_eval = True plot_aggregated_test = False plot_only_aggregated_test = False max_x_lim = 18 load_pattern = "_" test_envs_to_plot = None plot_path = "../case_studies_final_figures/Imitation_test" require_patterns = [ "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.25_50", "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__0.5_50", "20-01_Imitation_PPO_CB_exploration-bonus-type_cell_exploration-bonus-params__1_50", ] # to_compare = [ # ("19-01_Color_CB_heldout_doors", "20-01_JA_Color_CB_heldout_doors", "black") # ] to_compare = None elif load_pattern == "pilot_pointing": study_train = True study_eval = False show_plot = False save = True plot_path = "../case_studies_final_figures/pilot_pointing" load_pattern = "29-10_SAI_Pointing_CS_PPO_" require_patterns = [ "29-10_SAI_Pointing_CS_PPO_CB_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_cell_exploration-bonus-params__2_50_exploration-bonus-tanh_0.6", "29-10_SAI_Pointing_CS_PPO_CBL_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_lang_exploration-bonus-params__10_50_exploration-bonus-tanh_0.6", "29-10_SAI_Pointing_CS_PPO_no_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4", "29-10_SAI_Pointing_CS_PPO_RIDE_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_ride_intrinsic-reward-coef_0.01", "29-10_SAI_Pointing_CS_PPO_RND_env_SocialAI-EPointingInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_rnd_intrinsic-reward-coef_0.005", ] color_dict = { "PPO_RIDE": "orange", "PPO_RND": "magenta", "PPO_no": "maroon", "PPO_CBL": "green", "PPO_CB": "blue", } def label_parser(label): label = label.split("_env_")[0].split("SAI_")[1] label=label.replace("Pointing_CS_", "") return label to_compare = None elif load_pattern == "pilot_color": study_train = True study_eval = False show_plot = False save = True plot_path = "../case_studies_final_figures/pilot_color" load_pattern = "29-10_SAI_LangColor_CS" require_patterns = [ "29-10_SAI_LangColor_CS_PPO_CB_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_cell_exploration-bonus-params__2_50_exploration-bonus-tanh_0.6", "29-10_SAI_LangColor_CS_PPO_CBL_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_lang_exploration-bonus-params__10_50_exploration-bonus-tanh_0.6", "29-10_SAI_LangColor_CS_PPO_no_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4", "29-10_SAI_LangColor_CS_PPO_RIDE_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_ride_intrinsic-reward-coef_0.01", "29-10_SAI_LangColor_CS_PPO_RND_env_SocialAI-ELangColorInformationSeekingParamEnv-v1_recurrence_5_lr_1e-4_exploration-bonus-type_rnd_intrinsic-reward-coef_0.005" ] color_dict = { "PPO_RIDE": "orange", "PPO_RND": "magenta", "PPO_no": "maroon", "PPO_CBL": "green", "PPO_CB": "blue", } def label_parser(label): label = label.split("_env_")[0].split("SAI_")[1] label=label.replace("LangColor_CS_", "") return label to_compare = None elif load_pattern == "formats_train": study_train = True study_eval = False plot_aggregated_test = False plot_only_aggregated_test = False max_x_lim = 45 load_pattern = "_" test_envs_to_plot = None plot_path = "../case_studies_final_figures/Formats_train" require_patterns = [ "21-01_formats_50M_CBL", "05-01_scaffolding_50M_no_acl", ] to_compare = [ ("21-01_formats_50M_CBL", "05-01_scaffolding_50M_no_acl", "black") ] def label_parser(label): label = label.replace("05-01_scaffolding_50M_no_acl", "PPO_no_bonus") label = label.replace("21-01_formats_50M_CBL", "PPO_CBL") return label elif load_pattern == "adversarial": show_plot = False save = True study_train = True study_eval = False plot_aggregated_test = False plot_only_aggregated_test = False # max_x_lim = 45 smooth_factor = 0 load_pattern = "_" test_envs_to_plot = None plot_path = "../case_studies_final_figures/adversarial" require_patterns = [ "26-01_Adversarial_2M_PPO_CB_hidden_npc", "26-01_Adversarial_2M_PPO_CB_asoc", "26-01_Adversarial_2M_PPO_CB", ] to_compare = [ ("26-01_Adversarial_2M_PPO_CB", "26-01_Adversarial_2M_PPO_CB_hidden_npc", "orange"), ("26-01_Adversarial_2M_PPO_CB", "26-01_Adversarial_2M_PPO_CB_asoc", "green") ] def label_parser(label): label = label.replace("26-01_Adversarial_2M_PPO_CB_hidden_npc", "PPO_CB_invisible_peer") label = label.replace("26-01_Adversarial_2M_PPO_CB_asoc", "PPO_CB_no_peer") label = label.replace("26-01_Adversarial_2M_PPO_CB", "PPO_CB") return label color_dict = { "PPO_CB": "blue", "PPO_CB_invisible_peer": "orange", "PPO_CB_no_peer": "green", } elif load_pattern == "adversarial_stumps": study_train = True study_eval = False plot_aggregated_test = False plot_only_aggregated_test = False # max_x_lim = 45 smooth_factor = 0 load_pattern = "_" test_envs_to_plot = None plot_path = "../case_studies_final_figures/adversarial_stumps" require_patterns = [ "26-01_Adversarial_5M_Stumps_PPO_CB_hidden_npc", "26-01_Adversarial_5M_Stumps_PPO_CB_asoc", "26-01_Adversarial_5M_Stumps_PPO_CB", ] to_compare = [ ("26-01_Adversarial_5M_Stumps_PPO_CB", "26-01_Adversarial_5M_Stumps_PPO_CB_hidden_npc", "orange"), ("26-01_Adversarial_5M_Stumps_PPO_CB", "26-01_Adversarial_5M_Stumps_PPO_CB_asoc", "green") ] def label_parser(label): label = label.replace("26-01_Adversarial_5M_Stumps_PPO_CB_hidden_npc", "PPO_CB_invisible_peer") label = label.replace("26-01_Adversarial_5M_Stumps_PPO_CB_asoc", "PPO_CB_no_peer") label = label.replace("26-01_Adversarial_5M_Stumps_PPO_CB", "PPO_CB") return label color_dict = { "PPO_CB": "blue", "PPO_CB_invisible_peer": "orange", "PPO_CB_no_peer": "green", } else: plot_path = "plots/testplot" require_patterns = [ "_", # pointing # "04-01_Pointing_CB_heldout_doors", ] if to_compare is None and len(require_patterns) == 2 and "_" not in require_patterns: # if only two curves compare those two automatically to_compare = [(require_patterns[0], require_patterns[1], "black")] save=False show_plot = True # all of those include_patterns = [] #include_patterns = ["rec_5"] fontsize = 20 legend_fontsize = 20 linewidth = 5 # linewidth = 1 leg_args = { 'fontsize': legend_fontsize } title_fontsize = int(fontsize*1.2) storage_dir = "storage/" if load_pattern.startswith(storage_dir): load_pattern = load_pattern[len(storage_dir):] if load_pattern.startswith("./storage/"): load_pattern = load_pattern[len("./storage/"):] get_datasets(storage_dir, str(load_pattern), load_subsample_step=load_subsample_step, ignore_patterns=ignore_patterns, require_patterns=require_patterns) label_parser_dict = { # "PPO_CB": "PPO_CB", # "02-06_AppleStealing_experiments_cb_bonus_angle_occ_env_SocialAI-OthersPerceptionInferenceParamEnv-v1_exploration-bonus-type_cell": "NPC_visible", } env_type = str(load_pattern) fig_type = "test" try: top_n = int(sys.argv[2]) except: top_n = 8 to_remove = [] for tr_ in to_remove: if tr_ in models_saves: del models_saves[tr_] print("Loaded:") print("\n".join(list(models_saves.keys()))) if per_model_colors: # order runs for legend order as in per_models_colors, with corresponding colors ordered_labels = OrderedDict() for teacher_type in per_model_colors.keys(): for k,v in labels.items(): if teacher_type in k: ordered_labels[k] = v labels = ordered_labels else: print('not using per_model_color') for k in models_saves.keys(): labels[k] = k # Plot utils def plot_with_shade(subplot_nb, ax, x, y, err, color, shade_color, label, legend=False, leg_loc='best', title=None, ylim=[0, 100], xlim=[0, 40], leg_args={}, leg_linewidth=13.0, linewidth=10.0, labelsize=20, fontsize=20, title_fontsize=30, zorder=None, xlabel='Perf', ylabel='Env steps', linestyle="-", xnbins=3, ynbins=3): #plt.rcParams.update({'font.size': 15}) ax.locator_params(axis='x', nbins=xnbins) ax.locator_params(axis='y', nbins=ynbins) ax.tick_params(axis='y', which='both', labelsize=labelsize) ax.tick_params(axis='x', which='both', labelsize=labelsize*0.8) # ax.tick_params(axis='both', which='both', labelsize="small") # ax.scatter(x, y, color=color,linewidth=linewidth,zorder=zorder, linestyle=linestyle) ax.plot(x, y, color=color, label=label, linewidth=linewidth, zorder=zorder, linestyle=linestyle) if not np.array_equal(err, np.zeros_like(err)): ax.fill_between(x, y-err, y+err, color=shade_color, alpha=0.2) if legend: leg = ax.legend(loc=leg_loc, **leg_args) # 34 for legobj in leg.legendHandles: legobj.set_linewidth(leg_linewidth) ax.set_xlabel(xlabel, fontsize=fontsize) if subplot_nb == 0: ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=2) ax.set_xlim(xmin=xlim[0], xmax=xlim[1]) ax.set_ylim(bottom=ylim[0], top=ylim[1]) if title: ax.set_title(title, fontsize=title_fontsize) # only one figure is drawn -> maybe we can add loops later assert len(metrics) == 1 f, ax = plt.subplots(1, 1, figsize=(9.0, 9.0)) if len(metrics) == 1: ax = [ax] # max_y = -np.inf min_y = np.inf max_steps = 0 exclude_patterns = [] metric = metrics[0] ylabel = { "success_rate_mean": "Success rate (%)", "exploration_bonus_mean": "Exploration bonus", "NPC_intro": "Successful introduction (%)", }.get(metric, metric) # for metric_i, metric in enumerate(metrics): default_colors = default_colors_.copy() if study_train: for model_i, model_id in enumerate(models_saves.keys()): #excluding some experiments if any([ex_pat in model_id for ex_pat in exclude_patterns]): continue if len(include_patterns) > 0: if not any([in_pat in model_id for in_pat in include_patterns]): continue runs_data = models_saves[model_id]['data'] ys = [] if runs_data[0]['frames'][1] == 'frames': runs_data[0]['frames'] = list(filter(('frames').__ne__, runs_data[0]['frames'])) if per_seed: min_len = None else: # determine minimal run length across seeds lens = [len(run['frames']) for run in runs_data if len(run['frames'])] minimum = sorted(lens)[-min(top_n, len(lens))] min_len = np.min([len(run['frames']) for run in runs_data if len(run['frames']) >= minimum]) # keep only top k runs_data = [run for run in runs_data if len(run['frames']) >= minimum] # min_len = np.min([len(run['frames']) for run in runs_data if len(run['frames']) > 10]) # compute env steps (x axis) longest_id = np.argmax([len(rd['frames']) for rd in runs_data]) steps = np.array(runs_data[longest_id]['frames'], dtype=np.int) / steps_denom steps = steps[:min_len] for run in runs_data: if metric not in run: raise ValueError(f"Metric {metric} not found. Possible metrics: {list(run.keys())}") data = run[metric] # checking for header if data[1] == metric: data = np.array(list(filter((metric).__ne__, data)), dtype=np.float16) if per_seed: ys.append(data) else: if len(data) >= min_len: # discard extra if len(data) > min_len: print("run has too many {} datapoints ({}). Discarding {}".format(model_id, len(data), len(data) - min_len)) data = data[0:min_len] ys.append(data) else: raise ValueError("How can data be < min_len if it was capped above") ys_same_len = ys # computes stats n_seeds = len(ys_same_len) if per_seed: sems = np.array(ys_same_len) means = np.array(ys_same_len) stds = np.zeros_like(means) color = default_colors[model_i] else: sems = np.std(ys_same_len, axis=0)/np.sqrt(len(ys_same_len)) # sem stds = np.std(ys_same_len, axis=0) # std means = np.mean(ys_same_len, axis=0) color = default_colors[model_i] if metric == 'duration': means = means / 3600 sems = sems / 3600 stds = stds / 3600 if per_seed: # plot x y bounds curr_max_steps = np.max(np.max(steps)) else: # plot x y bounds curr_max_steps = np.max(steps) if curr_max_steps > max_steps: max_steps = curr_max_steps if subsample_step: steps = steps[0::subsample_step] means = means[0::subsample_step] stds = stds[0::subsample_step] sems = sems[0::subsample_step] ys_same_len = [y[0::subsample_step] for y in ys_same_len] # display seeds separately if per_seed: for s_i, seed_ys in enumerate(ys_same_len): label = label_parser(model_id) if study_eval: label = label + "_train_" label = label + f"(s:{s_i})" if label in color_dict: color = color_dict[label] else: color = default_colors[model_i*20+s_i] curve_ID = f"{model_id}_{s_i}" assert np.array_equal(stds, np.zeros_like(stds)) if smooth_factor: means = smooth(means, smooth_factor) to_plot_dict[curve_ID] = { "label": label, "steps": steps, "means": seed_ys, "stds": stds, "ys": ys_same_len, "color": color } else: label = label_parser(model_id) if study_eval: label = label+"(train)" if color_dict: color = color_dict[label] else: color = default_colors[model_i] if smooth_factor: means = smooth(means, smooth_factor) stds = smooth(stds, smooth_factor) to_plot_dict[model_id] = { "label": label, "steps": steps, "means": means, "stds": stds, "sems": sems, "ys": ys_same_len, "color": color, } if study_eval: print("Evaluation") # evaluation sets number_of_eval_envs = max(list([len(v.keys()) for v in model_eval_data.values()])) if plot_aggregated_test: number_of_eval_envs += 1 if number_of_eval_envs == 0: print("No eval envs") exit() default_colors = default_colors_.copy() test_summary_dict = defaultdict(dict) test_summary_dict_colors = defaultdict(dict) for model_i, model_id in enumerate(model_eval_data.keys()): # excluding some experiments if any([ex_pat in model_id for ex_pat in exclude_patterns]): continue if len(include_patterns) > 0: if not any([in_pat in model_id for in_pat in include_patterns]): continue # test envs test_envs = model_eval_data[model_id].items() # filter unwanted eval envs if test_envs_to_plot is not None: test_envs = [(name, data) for name, data in test_envs if name in test_envs_to_plot] # computes stats if sort_test: test_envs_sorted = list(sorted(test_envs, key=lambda kv: sort_test_set(kv[0]))) else: test_envs_sorted = list(test_envs) if plot_aggregated_test: agg_means = [] for env_i, (test_env, env_data) in enumerate(test_envs_sorted): ys_same_len = env_data["values"] steps = env_data["steps"].mean(0) / steps_denom n_seeds = len(ys_same_len) if per_seed: sems = np.array(ys_same_len) stds = np.array(ys_same_len) means = np.array(ys_same_len) color = default_colors[model_i] # plot x y bounds curr_max_steps = np.max(np.max(steps)) else: sems = np.std(ys_same_len, axis=0) / np.sqrt(len(ys_same_len)) # sem stds = np.std(ys_same_len, axis=0) # std means = np.mean(ys_same_len, axis=0) color = default_colors[model_i] curr_max_steps = np.max(steps) if plot_aggregated_test: agg_means.append(means) x_lim = max(steps[-1], x_lim) x_lim = min(max_x_lim, x_lim) eval_metric_name = { "test_success_rates": "Success rate", 'exploration_bonus_mean': "Exploration bonus", }.get(eval_metric, eval_metric) test_env_name = test_env.replace("Env", "").replace("Test", "") env_types = ["InformationSeeking", "Collaboration", "PerspectiveTaking"] for env_type in env_types: if env_type in test_env_name: test_env_name = test_env_name.replace(env_type, "") test_env_name += f"\n({env_type})" if per_seed: for s_i, seed_ys in enumerate(ys_same_len): label = label_parser(model_id) + f"_{test_env}" + f"(s:{s_i})" if eval_smooth_factor: seed_ys = smooth(seed_ys, eval_smooth_factor) curve_ID = f"{model_id}_{test_env}_{s_i}" to_plot_dict[curve_ID] = { "label": label, "steps": steps, "means": seed_ys, "stds": np.zeros_like(seed_ys), "ys": ys_same_len, "color": color } else: if len(test_envs_sorted) > 1: label = label_parser(model_id) + f"_{test_env}" else: label = label_parser(model_id) if study_train: label=label+"(test)" if not plot_only_aggregated_test: if label in color_dict: color = color_dict[label] else: color = default_colors[model_i*len(test_envs_sorted)+env_i] if legend_show_n_seeds: label = label + "({})".format(n_seeds) if eval_smooth_factor: means = smooth(means, eval_smooth_factor) stds = smooth(stds, eval_smooth_factor) sems = smooth(sems, eval_smooth_factor) to_plot_dict[model_id+f"_{test_env}"] = { "label": label, "steps": steps, "means": means, "stds": stds, "sems": sems, "ys": ys_same_len, "color": color, } if plot_aggregated_test: ys_same_len = agg_means agg_means = np.array(agg_means) agg_mean = agg_means.mean(axis=0) agg_std = agg_means.std(axis=0) # std agg_sems = ... label = label_parser(model_id) if study_train: label = label + "(train)" if eval_smooth_factor: agg_mean = smooth(agg_mean, eval_smooth_factor) agg_std = smooth(agg_std, eval_smooth_factor) agg_sems = smooth(agg_sems, eval_smooth_factor) if per_seed: print("Not smooth aggregated because of per seed") for s_i, (seed_ys, seed_st) in enumerate(zip(agg_mean, agg_std)): seed_c = default_colors[model_i + s_i] label = str(s_i) to_plot_dict[curve_ID] = { "label": label, "steps": steps, "means": seed_ys, "stds": seed_st, "ys": ys_same_len, "color": color } else: if label in color_dict: color = color_dict[label] else: color = default_colors[model_i] to_plot_dict[model_id+"_agg_test"] = { "label": label, "steps": steps, "means": agg_mean, "stds": agg_std, "sems": agg_sems, "ys": ys_same_len, "color": color, } # should be labels to_scatter_dict = {} if to_compare is not None: for comp_i, (a_model_id, b_model_id, color) in enumerate(to_compare): a_data = to_plot_dict[a_model_id]["ys"] b_data = to_plot_dict[b_model_id]["ys"] steps = to_plot_dict[a_model_id]["steps"] if color == "auto_color": color = to_plot_dict[a_model_id]["color"] if len(a_data[0]) != len(b_data[0]): # extract steps present in both a_steps = to_plot_dict[a_model_id]["steps"] b_steps = to_plot_dict[b_model_id]["steps"] steps = list(set(a_steps) & set(b_steps)) # keep only the values for those steps mask_a = [(a_s in steps) for a_s in a_steps] a_data = np.array(a_data)[:, mask_a] mask_b = [(b_s in steps) for b_s in b_steps] b_data = np.array(b_data)[:, mask_b] p = stats.ttest_ind( a_data, b_data, equal_var=False ).pvalue steps = [s for s, p in zip(steps, p) if p < test_p] ys = [1.02+0.02*comp_i]*len(steps) to_scatter_dict[f"compare_{a_model_id}_{b_model_id}"] = { "label": "", "xs": steps, "ys": ys, "color": color, } for scatter_i, (scatter_ID, scatter_id_data) in enumerate(to_scatter_dict.items()): # unpack data label, xs, ys, color = ( scatter_id_data["label"], scatter_id_data["xs"], scatter_id_data["ys"], scatter_id_data["color"], ) xlabel = f"Env steps (1e6)" plt.scatter( xs, ys, color=color, marker="x" ) summary_dict[label] = xs[-1] summary_dict_colors[label] = color for curve_i, (curve_ID, model_id_data) in enumerate(to_plot_dict.items()): # unpack data label, steps, means, stds, sems, ys, color = ( model_id_data["label"], model_id_data["steps"], model_id_data["means"], model_id_data["stds"], model_id_data["sems"], model_id_data["ys"], model_id_data["color"] ) # if smooth_factor: # means = smooth(means, smooth_factor) # stds = smooth(stds, smooth_factor) if legend_show_n_seeds: n_seeds = len(ys) label = label+"({})".format(n_seeds) x_lim = max(steps[-1], x_lim) x_lim = min(max_x_lim, x_lim) xlabel = f"Env steps (1e6)" plot_with_shade( 0, ax[0], steps, means, stds, color, color, label, # 0, ax[0], steps, means, sems, color, color, label, legend=draw_legend, xlim=[0, x_lim], ylim=[0, max_y], xlabel=xlabel, ylabel=ylabel, title=None, labelsize=fontsize, fontsize=fontsize, title_fontsize=title_fontsize, linewidth=linewidth, leg_linewidth=5, leg_args=leg_args, xnbins=xnbins, ynbins=ynbins, ) summary_dict[label] = means[-1] summary_dict_colors[label] = color # plot static lines if static_lines: for label, (mean, std, color) in static_lines.items(): if label == "": label = None plot_with_shade( 0, ax[0], steps, np.array([mean]*len(steps)), np.array([std]*len(steps)), color, color, label, legend=True, xlim=[0, x_lim], ylim=[0, 1.0], xlabel=f"Env steps (1e6)", ylabel=ylabel, linestyle=":", leg_args=leg_args, fontsize=fontsize, title_fontsize=title_fontsize, xnbins=xnbins, ynbins=ynbins, ) if plot_path: f.savefig(plot_path+".png") f.savefig(plot_path+".svg") print(f"Plot saved to {plot_path}.[png/svg].") # Summary dict if len(summary_dict) == 0: raise ValueError(f"No experiments found for {load_pattern}.") else: # print summary best = max(summary_dict.values()) pc = 0.3 n = int(len(summary_dict)*pc) print("top n: ", n) top_pc = sorted(summary_dict.values())[-n:] bottom_pc = sorted(summary_dict.values())[:n] print("legend:") cprint("\tbest", "green") cprint("\ttop {} %".format(pc), "blue") cprint("\tbottom {} %".format(pc), "red") print("\tothers") print() for l, p in sorted(summary_dict.items(), key=lambda kv: kv[1]): c = summary_dict_colors[l] if p == best: cprint("label: {} ({})".format(l, c), "green") cprint("\t {}:{}".format(metric, p), "green") elif p in top_pc: cprint("label: {} ({})".format(l, c), "blue") cprint("\t {}:{}".format(metric, p), "blue") elif p in bottom_pc: cprint("label: {} ({})".format(l, c), "red") cprint("\t {}:{}".format(metric, p), "red") else: print("label: {} ({})".format(l, c)) print("\t {}:{}".format(metric, p)) if show_plot: plt.tight_layout() plt.subplots_adjust(hspace=1.5, wspace=0.5, left=0.1, right=0.9, bottom=0.1, top=0.85) plt.suptitle(super_title) plt.show() plt.close()