import json
import os


def ood_t2i_agg(model, result_dir):
    """
    Aggregate OOD text-to-image scores for the given model.

    Parameters:
        model (str): Model name.
        result_dir (str): Path to the directory where the results are stored.

    Returns:
        dict: The overall score and the per-subscenario scores in the format
            {"score": float, "subscenarios": dict}. For example, OOD T2I uses
            subscenarios such as counting_shake.
    """
    result_path = os.path.join(result_dir, "ood_t2i_summary.json")
    with open(result_path, "r") as file:
        results = json.load(file)

    agg_scores = {}
    # for model in models:
    # Keep only the model base name.
    model = model.split("/")[-1]

    results_shake_fidelity = 0
    results_shake_counting = 0
    results_shake_spatial = 0
    results_shake_color = 0
    results_shake_size = 0
    results_paraphrase_fidelity = 0
    results_paraphrase_counting = 0
    results_paraphrase_spatial = 0
    results_paraphrase_color = 0
    results_paraphrase_size = 0

    # Sum the scores of the three trials for each subscenario.
    for trial_id in [0, 1, 2]:
        results_shake_fidelity += results[model][f'trial_{trial_id}']['fidelity']['Shake_']
        results_shake_counting += results[model][f'trial_{trial_id}']['counting']['Shake_']
        results_shake_spatial += results[model][f'trial_{trial_id}']['spatial']['Shake_']
        results_shake_color += results[model][f'trial_{trial_id}']['color']['Shake_']
        results_shake_size += results[model][f'trial_{trial_id}']['size']['Shake_']
        results_paraphrase_fidelity += results[model][f'trial_{trial_id}']['fidelity']['Paraphrase_']
        results_paraphrase_counting += results[model][f'trial_{trial_id}']['counting']['Paraphrase_']
        results_paraphrase_spatial += results[model][f'trial_{trial_id}']['spatial']['Paraphrase_']
        results_paraphrase_color += results[model][f'trial_{trial_id}']['color']['Paraphrase_']
        results_paraphrase_size += results[model][f'trial_{trial_id}']['size']['Paraphrase_']

    # Average over the three trials; fidelity is additionally scaled by 100.
    results_shake_fidelity = results_shake_fidelity * 100 / 3
    results_shake_counting /= 3
    results_shake_spatial /= 3
    results_shake_color /= 3
    results_shake_size /= 3
    results_shake_attribute = (results_shake_color + results_shake_size) / 2

    results_paraphrase_fidelity = results_paraphrase_fidelity * 100 / 3
    results_paraphrase_counting /= 3
    results_paraphrase_spatial /= 3
    results_paraphrase_color /= 3
    results_paraphrase_size /= 3
    results_paraphrase_attribute = (results_paraphrase_color + results_paraphrase_size) / 2

    avg_shake = (results_shake_fidelity + results_shake_counting
                 + results_shake_spatial + results_shake_attribute) / 4
    avg_paraphrase = (results_paraphrase_fidelity + results_paraphrase_counting
                      + results_paraphrase_spatial + results_paraphrase_attribute) / 4
    agg_score = (avg_shake + avg_paraphrase) / 2

    agg_scores["score"] = agg_score
    agg_scores["subscenarios"] = {
        "helpfulness_shake": results_shake_fidelity,
        "counting_shake": results_shake_counting,
        "spatial_shake": results_shake_spatial,
        "attribute_shake": results_shake_attribute,
        "helpfulness_rare": results_paraphrase_fidelity,
        "counting_rare": results_paraphrase_counting,
        "spatial_rare": results_paraphrase_spatial,
        "attribute_rare": results_paraphrase_attribute,
    }
    return agg_scores
    # agg_scores[model] = agg_score
    # return agg_scores


def ood_i2t_agg(model, result_dir):
    """
    Aggregate OOD image-to-text scores for the given model.

    Parameters:
        model (str): Model name.
        result_dir (str): Path to the directory where the results are stored.

    Returns:
        dict: The overall score and the per-subscenario scores in the format
            {"score": float, "subscenarios": dict}. For example, OOD I2T uses
            subscenarios such as counting_transform.
    """
    transformations = ["Van_Gogh", "oil_painting", "watercolour_painting"]
    corruptions = ["zoom_blur", "gaussian_noise", "pixelate"]

    result_path = os.path.join(result_dir, "ood_i2t_summary.json")
    with open(result_path, "r") as file:
        results = json.load(file)

    agg_scores = {}
    # for model in models:
    # Keep only the model base name.
    model = model.split("/")[-1]

    # Average each task's score over the three image corruptions.
    identification_corrupt = sum([results[model]['identification'][corrupt]["Score"] for corrupt in corruptions]) / 3
    count_corrupt = sum([results[model]['count'][corrupt]["Score"] for corrupt in corruptions]) / 3
    spatial_corrupt = sum([results[model]['spatial'][corrupt]["Score"] for corrupt in corruptions]) / 3
    attribute_corrupt = sum([results[model]['attribute'][corrupt]["Score"] for corrupt in corruptions]) / 3
    avg_corrupt = (identification_corrupt + count_corrupt + spatial_corrupt + attribute_corrupt) / 4

    # Average each task's score over the three style transformations.
    identification_transform = sum([results[model]['identification'][transform]["Score"] for transform in transformations]) / 3
    count_transform = sum([results[model]['count'][transform]["Score"] for transform in transformations]) / 3
    spatial_transform = sum([results[model]['spatial'][transform]["Score"] for transform in transformations]) / 3
    attribute_transform = sum([results[model]['attribute'][transform]["Score"] for transform in transformations]) / 3
    avg_transform = (identification_transform + count_transform + spatial_transform + attribute_transform) / 4

    agg_scores["score"] = (avg_corrupt + avg_transform) / 2
    agg_scores["subscenarios"] = {
        "object_corrupt": identification_corrupt,
        "counting_corrupt": count_corrupt,
        "spatial_corrupt": spatial_corrupt,
        "attribute_corrupt": attribute_corrupt,
        "object_transform": identification_transform,
        "counting_transform": count_transform,
        "spatial_transform": spatial_transform,
        "attribute_transform": attribute_transform,
    }
    return agg_scores
    # agg_scores[model] = agg_score
    # return agg_scores


if __name__ == "__main__":
    t2i_models = [
        # Average time spent running the following example
        "dall-e-2",
        "dall-e-3",
        "DeepFloyd/IF-I-M-v1.0",  # 15.372
        "dreamlike-art/dreamlike-photoreal-2.0",  # 3.526
        "prompthero/openjourney-v4",  # 4.981
        "stabilityai/stable-diffusion-xl-base-1.0",  # 7.463
    ]
    i2t_models = [
        # Average time spent running the following example
        "gpt-4-vision-preview",
        "gpt-4o-2024-05-13",
        "llava-hf/llava-v1.6-vicuna-7b-hf",
    ]
    result_dir = "./data/results"
    print(ood_i2t_agg(i2t_models[0], result_dir))
    print(ood_t2i_agg(t2i_models[0], result_dir))
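    # Optional sketch, not part of the original script: collect scores for
    # every listed model into one dict per modality, mirroring the
    # commented-out multi-model loop inside the aggregation functions.
    # Assumes the summary JSON files contain an entry for every model above;
    # `all_t2i_scores` and `all_i2t_scores` are illustrative names.
    all_t2i_scores = {m: ood_t2i_agg(m, result_dir) for m in t2i_models}
    all_i2t_scores = {m: ood_i2t_agg(m, result_dir) for m in i2t_models}
    print(json.dumps({"t2i": all_t2i_scores, "i2t": all_i2t_scores}, indent=2))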