"""Radar-chart and table plotting utilities for MM-DecodingTrust benchmark scores."""

import plotly.colors
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import matplotlib.pyplot as plt
import argparse
from utils.score_extract.ood_agg import ood_t2i_agg, ood_i2t_agg

DEFAULT_PLOTLY_COLORS = plotly.colors.DEFAULT_PLOTLY_COLORS


def to_rgba(rgb, alpha=1):
    # Convert an 'rgb(r, g, b)' color string to 'rgba(r, g, b, alpha)'.
    return 'rgba' + rgb[3:][:-1] + f', {alpha})'


def radar_plot(results, thetas, selected_models):
    # Extract performance values for each model across all benchmarks
    model_performance = {}
    selected_models = [os.path.basename(model) for model in selected_models]
    for model in selected_models:
        if model in results:
            benchmarks_data = results[model]
            model_performance[model] = [benchmarks_data[subfield] for subfield in benchmarks_data.keys()]

    # Create radar chart with plotly
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.2,
        row_heights=[1, 0.4],
        specs=[[{"type": "polar"}], [{"type": "table"}]]
    )

    for i, (model, performance) in enumerate(model_performance.items()):
        color = DEFAULT_PLOTLY_COLORS[i % len(DEFAULT_PLOTLY_COLORS)]
        fig.add_trace(
            go.Scatterpolar(
                r=performance + [performance[0]],
                theta=thetas + [thetas[0]],
                fill='toself',
                connectgaps=True,
                fillcolor=to_rgba(color, 0.1),
                name=model.split('/')[-1],  # Use the last part of the model name for clarity
            ),
            row=1, col=1
        )

    header_texts = ["Model"] + [x.replace("<br>", " ") for x in thetas]
    rows = [[x.split('/')[-1] for x in selected_models]] + [
        [round(score[i], 2) for score in [model_performance[x] for x in selected_models]]
        for i in range(len(thetas))
    ]
    # column_widths = [len(x) for x in header_texts]
    # column_widths[0] *= len(thetas)

    fig.add_trace(
        go.Table(
            header=dict(values=header_texts, font=dict(size=14.5), align="left"),
            cells=dict(
                values=rows,
                align="left",
                font=dict(size=14.5),
                height=30
            ),
            # columnwidth=column_widths
        ),
        row=2, col=1
    )

    fig.update_layout(
        height=900,
        legend=dict(font=dict(size=20), orientation="h", xanchor="center", x=0.5, y=0.35),
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],  # Assuming accuracy is a percentage between 0 and 100
                tickfont=dict(size=12)
            ),
            angularaxis=dict(tickfont=dict(size=20), type="category")
        ),
        showlegend=True,
        # title=f"{title}"
    )
    return fig


def main_radar_plot(main_scores, selected_models):
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.2,
        row_heights=[1.0, 0.5],
        specs=[[{"type": "polar"}], [{"type": "table"}]]
    )

    model_scores = {}
    for model in selected_models:
        model_name = os.path.basename(model)
        model_scores[model_name] = main_scores[model_name]

    perspectives = list(model_scores[os.path.basename(selected_models[0])].keys())
    perspectives_shift = perspectives

    for i, model_name in enumerate(model_scores.keys()):
        color = DEFAULT_PLOTLY_COLORS[i % len(DEFAULT_PLOTLY_COLORS)]
        score_shifted = list(model_scores[model_name].values())
        fig.add_trace(
            go.Scatterpolar(
                r=score_shifted + [score_shifted[0]],
                theta=perspectives_shift + [perspectives_shift[0]],
                connectgaps=True,
                fill='toself',
                fillcolor=to_rgba(color, 0.1),
                name=model_name,  # Use the last part of the model name for clarity
            ),
            row=1, col=1
        )

    header_texts = ["Model"] + perspectives
    rows = [
        list(model_scores.keys()),  # Model Names
        *[[round(score[perspective], 2) for score in list(model_scores.values())] for perspective in perspectives]
    ]
    column_widths = [10] + [5] * len(perspectives)

    fig.add_trace(
        go.Table(
            header=dict(values=header_texts, font=dict(size=14.5), align="left"),
            cells=dict(
                values=rows,
                align="left",
                font=dict(size=14.5),
                height=30,
            ),
            columnwidth=column_widths,
        ),
        row=2, col=1
    )

    fig.update_layout(
        height=1200,
        legend=dict(font=dict(size=20), orientation="h", xanchor="center", x=0.5, y=0.4),
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],  # Assuming accuracy is a percentage between 0 and 100
                tickfont=dict(size=12)
            ),
            angularaxis=dict(tickfont=dict(size=20), type="category", rotation=5)
        ),
        showlegend=True,
        title=dict(text="MM-DecodingTrust Scores (Higher is Better)"),
    )
    return fig


def breakdown_plot(scenario_results, subfields, selected_models):
    fig = radar_plot(scenario_results, subfields, selected_models)
    return fig


def update_subscores(target_model, main_scores, config_dicts):
    perspectives = []
    target_model = target_model.split('/')[-1]
    curr_main_scores = {}
    curr_main_scores[target_model] = {}
    for perspective in main_scores[target_model].keys():
        curr_main_scores[target_model][config_dicts[perspective]["name"]] = main_scores[target_model][perspective]
        perspectives.append(config_dicts[perspective]["name"])
    return curr_main_scores


def generate_plot(model, main_scores, sub_scores, config_dict, out_path="plots"):
    curr_main_scores = update_subscores(model, main_scores, config_dict)
    for idx, perspective in enumerate(config_dict.keys()):
        if config_dict[perspective]["sub_plot"] == False:
            continue
        # if "openai/gpt-4-0314" not in sub_scores[perspective].keys():
        #     model_list = [model]
        # else:
        #     model_list = [model, "openai/gpt-4-0314"]
        model_list = [model]
        subplot = breakdown_plot(sub_scores[perspective], list(sub_scores[perspective][model].keys()), model_list)
        perspective_name = config_dict[perspective]["name"].replace(" ", "_")
        subplot.write_image(f"{out_path}/{perspective_name}_breakdown.png", width=1400, height=700)
    plot = main_radar_plot(curr_main_scores, [model])
    plot.write_image(f"{out_path}/main.png", width=1400, height=700)


def generate_main_plot(models, main_scores):
    curr_main_scores = main_scores
    plot = main_radar_plot(curr_main_scores, models)
    return plot
    # plot.write_image(f"{out_path}/main.png", width=1400, height=700)


def generate_sub_plot(models, sub_scores, perspective):
    subplot = breakdown_plot(sub_scores[perspective], list(sub_scores[perspective][models[0]].keys()), models)
    return subplot


if __name__ == "__main__":
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--model", type=str, default="hf/meta-llama/Llama-2-7b-chat-hf")
    # args = parser.parse_args()

    t2i_models = [  # Average time spent running the following example
        "dall-e-2",
        "dall-e-3",
        "DeepFloyd/IF-I-M-v1.0",  # 15.372
        "dreamlike-art/dreamlike-photoreal-2.0",  # 3.526
        "prompthero/openjourney-v4",  # 4.981
        "stabilityai/stable-diffusion-xl-base-1.0",  # 7.463
    ]
    i2t_models = [  # Average time spent running the following example
        "gpt-4-vision-preview",
        "gpt-4o-2024-05-13",
        "llava-hf/llava-v1.6-vicuna-7b-hf"
    ]
    perspectives = ["Safety", "Fairness", "Hallucination", "Privacy", "Adv", "OOD"]

    main_scores_t2i = {}
    main_scores_i2t = {}
    sub_scores_t2i = {}
    sub_scores_i2t = {}

    for model in t2i_models:
        model = model.split("/")[-1]
        main_scores_t2i[model] = {}
        for perspective in perspectives:
            # Placeholder: OOD aggregates are used for every perspective for now
            main_scores_t2i[model][perspective] = ood_t2i_agg(model, "./data/results")["score"]
            if perspective not in sub_scores_t2i.keys():
                sub_scores_t2i[perspective] = {}
            sub_scores_t2i[perspective][model] = ood_t2i_agg(model, "./data/results")["subscenarios"]

    for model in i2t_models:
        model = model.split("/")[-1]
        main_scores_i2t[model] = {}
        for perspective in perspectives:
            # Placeholder: OOD aggregates are used for every perspective for now
            main_scores_i2t[model][perspective] = ood_i2t_agg(model, "./data/results")["score"]
            if perspective not in sub_scores_i2t.keys():
                sub_scores_i2t[perspective] = {}
            sub_scores_i2t[perspective][model] = ood_i2t_agg(model, "./data/results")["subscenarios"]

    # generate_main_plot(t2i_models, main_scores_t2i)
    # generate_main_plot(i2t_models, main_scores_i2t)
    generate_sub_plot(t2i_models, sub_scores_t2i, "OOD")
    # generate_sub_plot(i2t_models, sub_scores_i2t)
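
    # A minimal usage sketch (not part of the original flow): persist the returned OOD
    # breakdown figure to disk, mirroring how generate_plot writes images above. The
    # "plots" output directory and the file name are assumptions for illustration.
    os.makedirs("plots", exist_ok=True)
    ood_breakdown_fig = generate_sub_plot(t2i_models, sub_scores_t2i, "OOD")
    ood_breakdown_fig.write_image("plots/OOD_t2i_breakdown.png", width=1400, height=700)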