import json
import os

def ood_t2i_agg(model, result_dir):
    """
    Aggregate OOD text-to-image scores for a single model.

    Parameters:
        model (str): Model name.
        result_dir (str): Path to the directory where the results are stored.

    Returns:
        dict: The overall score and the per-subscenario scores in the format
            {"score": float, "subscenarios": dict}, with subscenario keys
            such as "counting_shake".
    """
    result_path = os.path.join(result_dir, "ood_t2i_summary.json")
    with open(result_path, "r") as file:
        results = json.load(file)
    agg_scores = {}
    # Keep only the model base name, e.g. "DeepFloyd/IF-I-M-v1.0" -> "IF-I-M-v1.0".
    model = model.split("/")[-1]
    # Sum each category's score over the three evaluation trials.
    categories = ["fidelity", "counting", "spatial", "color", "size"]
    shake = {cat: 0.0 for cat in categories}
    paraphrase = {cat: 0.0 for cat in categories}
    for trial_id in [0, 1, 2]:
        trial = results[model][f"trial_{trial_id}"]
        for cat in categories:
            shake[cat] += trial[cat]["Shake_"]
            paraphrase[cat] += trial[cat]["Paraphrase_"]
    # Average over trials; fidelity is additionally rescaled by 100 to put it
    # on the same scale as the other categories.
    for cat in categories:
        shake[cat] /= 3
        paraphrase[cat] /= 3
    shake["fidelity"] *= 100
    paraphrase["fidelity"] *= 100
    # "Attribute" is the mean of the color and size subscenarios.
    shake_attribute = (shake["color"] + shake["size"]) / 2
    paraphrase_attribute = (paraphrase["color"] + paraphrase["size"]) / 2
    avg_shake = (shake["fidelity"] + shake["counting"] + shake["spatial"] + shake_attribute) / 4
    avg_paraphrase = (paraphrase["fidelity"] + paraphrase["counting"] + paraphrase["spatial"] + paraphrase_attribute) / 4
    agg_score = (avg_shake + avg_paraphrase) / 2
    agg_scores["score"] = agg_score
    agg_scores["subscenarios"] = {
        "helpfulness_shake": shake["fidelity"],
        "counting_shake": shake["counting"],
        "spatial_shake": shake["spatial"],
        "attribute_shake": shake_attribute,
        "helpfulness_rare": paraphrase["fidelity"],
        "counting_rare": paraphrase["counting"],
        "spatial_rare": paraphrase["spatial"],
        "attribute_rare": paraphrase_attribute,
    }
    return agg_scores
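
# A minimal sketch of the layout ood_t2i_agg assumes for
# <result_dir>/ood_t2i_summary.json, inferred from the lookups above.
# All values here are hypothetical:
#
# {
#     "dall-e-2": {
#         "trial_0": {
#             "fidelity": {"Shake_": 0.71, "Paraphrase_": 0.74},
#             "counting": {"Shake_": 42.0, "Paraphrase_": 45.5},
#             "spatial":  {"Shake_": 38.2, "Paraphrase_": 40.1},
#             "color":    {"Shake_": 55.0, "Paraphrase_": 57.3},
#             "size":     {"Shake_": 47.8, "Paraphrase_": 49.0}
#         },
#         "trial_1": {...},
#         "trial_2": {...}
#     }
# }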

def ood_i2t_agg(model, result_dir):
    """
    Aggregate OOD image-to-text scores for a single model.

    Parameters:
        model (str): Model name.
        result_dir (str): Path to the directory where the results are stored.

    Returns:
        dict: The overall score and the per-subscenario scores in the format
            {"score": float, "subscenarios": dict}, with subscenario keys
            such as "counting_transform".
    """
    transformations = ["Van_Gogh", "oil_painting", "watercolour_painting"]
    corruptions = ["zoom_blur", "gaussian_noise", "pixelate"]
    result_path = os.path.join(result_dir, "ood_i2t_summary.json")
    with open(result_path, "r") as file:
        results = json.load(file)
    agg_scores = {}
    # Keep only the model base name, e.g. "llava-hf/llava-v1.6-vicuna-7b-hf" -> "llava-v1.6-vicuna-7b-hf".
    model = model.split("/")[-1]
    # Average each capability's score over the corruptions and the transformations.
    identification_corrupt = sum(results[model]["identification"][c]["Score"] for c in corruptions) / len(corruptions)
    count_corrupt = sum(results[model]["count"][c]["Score"] for c in corruptions) / len(corruptions)
    spatial_corrupt = sum(results[model]["spatial"][c]["Score"] for c in corruptions) / len(corruptions)
    attribute_corrupt = sum(results[model]["attribute"][c]["Score"] for c in corruptions) / len(corruptions)
    avg_corrupt = (identification_corrupt + count_corrupt + spatial_corrupt + attribute_corrupt) / 4
    identification_transform = sum(results[model]["identification"][t]["Score"] for t in transformations) / len(transformations)
    count_transform = sum(results[model]["count"][t]["Score"] for t in transformations) / len(transformations)
    spatial_transform = sum(results[model]["spatial"][t]["Score"] for t in transformations) / len(transformations)
    attribute_transform = sum(results[model]["attribute"][t]["Score"] for t in transformations) / len(transformations)
    avg_transform = (identification_transform + count_transform + spatial_transform + attribute_transform) / 4
agg_scores["score"] = (avg_corrupt + avg_transform) / 2 | |
agg_scores["subscenarios"] = { | |
"object_corrupt": identification_corrupt, | |
"counting_corrupt": count_corrupt, | |
"spatial_corrupt": spatial_corrupt, | |
"attribute_corrupt": attribute_corrupt, | |
"object_transform": identification_transform, | |
"counting_transform": count_transform, | |
"spatial_transform": spatial_transform, | |
"attribute_transform": attribute_transform | |
} | |
return agg_scores | |
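
# A minimal sketch of the layout ood_i2t_agg assumes for
# <result_dir>/ood_i2t_summary.json, inferred from the lookups above.
# All values here are hypothetical:
#
# {
#     "gpt-4-vision-preview": {
#         "identification": {
#             "zoom_blur": {"Score": 0.62},
#             "gaussian_noise": {"Score": 0.58},
#             "pixelate": {"Score": 0.60},
#             "Van_Gogh": {"Score": 0.70},
#             "oil_painting": {"Score": 0.68},
#             "watercolour_painting": {"Score": 0.66}
#         },
#         "count": {...},
#         "spatial": {...},
#         "attribute": {...}
#     }
# }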

if __name__ == "__main__":
    t2i_models = [  # Average time spent running the following example
        "dall-e-2",
        "dall-e-3",
        "DeepFloyd/IF-I-M-v1.0",  # 15.372
        "dreamlike-art/dreamlike-photoreal-2.0",  # 3.526
        "prompthero/openjourney-v4",  # 4.981
        "stabilityai/stable-diffusion-xl-base-1.0",  # 7.463
    ]
    i2t_models = [
        "gpt-4-vision-preview",
        "gpt-4o-2024-05-13",
        "llava-hf/llava-v1.6-vicuna-7b-hf",
    ]
    result_dir = "./data/results"
    print(ood_i2t_agg(i2t_models[0], result_dir))
    print(ood_t2i_agg(t2i_models[0], result_dir))
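    # Each call prints a dict shaped like the following (all numbers hypothetical):
    # {"score": 52.3, "subscenarios": {"object_corrupt": 48.1, ..., "attribute_transform": 55.0}}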