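"""Aggregate out-of-distribution (OOD) evaluation scores for text-to-image and
image-to-text models from per-model summary JSON files."""
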
import json
import os
def ood_t2i_agg(model, result_dir):
"""
Aggregate scores for the given testing models.
Parameters:
model (str): Model name.
result_dir (str): The path to the directory where the results are stored.
Returns:
dict: Output the overall score and the score of subscenarios in the format {"score": float, "subscenarios": dict}.
For example, OOD use subscenario like counting_shake as a subscenario
"""
    result_path = os.path.join(result_dir, "ood_t2i_summary.json")
    with open(result_path, "r") as file:
        results = json.load(file)
    agg_scores = {}
    # Keep only the model base name (e.g. "org/model" -> "model").
    model = model.split("/")[-1]
    results_shake_fidelity = 0
    results_shake_counting = 0
    results_shake_spatial = 0
    results_shake_color = 0
    results_shake_size = 0
    results_paraphrase_fidelity = 0
    results_paraphrase_counting = 0
    results_paraphrase_spatial = 0
    results_paraphrase_color = 0
    results_paraphrase_size = 0
    for trial_id in [0, 1, 2]:
        results_shake_fidelity += results[model][f'trial_{trial_id}']['fidelity']['Shake_']
        results_shake_counting += results[model][f'trial_{trial_id}']['counting']['Shake_']
        results_shake_spatial += results[model][f'trial_{trial_id}']['spatial']['Shake_']
        results_shake_color += results[model][f'trial_{trial_id}']['color']['Shake_']
        results_shake_size += results[model][f'trial_{trial_id}']['size']['Shake_']
        results_paraphrase_fidelity += results[model][f'trial_{trial_id}']['fidelity']['Paraphrase_']
        results_paraphrase_counting += results[model][f'trial_{trial_id}']['counting']['Paraphrase_']
        results_paraphrase_spatial += results[model][f'trial_{trial_id}']['spatial']['Paraphrase_']
        results_paraphrase_color += results[model][f'trial_{trial_id}']['color']['Paraphrase_']
        results_paraphrase_size += results[model][f'trial_{trial_id}']['size']['Paraphrase_']
    # Average over the three trials. Fidelity is rescaled by 100, which appears
    # to bring it onto the same 0-100 scale as the other metrics; the attribute
    # score is the mean of the color and size scores.
    results_shake_fidelity = results_shake_fidelity * 100 / 3
    results_shake_counting /= 3
    results_shake_spatial /= 3
    results_shake_color /= 3
    results_shake_size /= 3
    results_shake_attribute = (results_shake_color + results_shake_size) / 2
    results_paraphrase_fidelity = results_paraphrase_fidelity * 100 / 3
    results_paraphrase_counting /= 3
    results_paraphrase_spatial /= 3
    results_paraphrase_color /= 3
    results_paraphrase_size /= 3
    results_paraphrase_attribute = (results_paraphrase_color + results_paraphrase_size) / 2
    avg_shake = (results_shake_fidelity + results_shake_counting + results_shake_spatial + results_shake_attribute) / 4
    avg_paraphrase = (results_paraphrase_fidelity + results_paraphrase_counting + results_paraphrase_spatial + results_paraphrase_attribute) / 4
    agg_score = (avg_shake + avg_paraphrase) / 2
    agg_scores["score"] = agg_score
    agg_scores["subscenarios"] = {
        "helpfulness_shake": results_shake_fidelity,
        "counting_shake": results_shake_counting,
        "spatial_shake": results_shake_spatial,
        "attribute_shake": results_shake_attribute,
        "helpfulness_rare": results_paraphrase_fidelity,
        "counting_rare": results_paraphrase_counting,
        "spatial_rare": results_paraphrase_spatial,
        "attribute_rare": results_paraphrase_attribute
    }
    return agg_scores

def ood_i2t_agg(model, result_dir):
"""
Aggregate scores for the given testing models.
Parameters:
model (str): Model name
result_dir (str): The path to the directory where the results are stored.
Returns:
dict: Output the overall score and the score of subscenarios in the format {"score": float, "subscenarios": dict}.
For example, OOD use subscenario like counting_trans as a subscenario
"""
    transformations = ["Van_Gogh", "oil_painting", "watercolour_painting"]
    corruptions = ["zoom_blur", "gaussian_noise", "pixelate"]
    result_path = os.path.join(result_dir, "ood_i2t_summary.json")
    with open(result_path, "r") as file:
        results = json.load(file)
    agg_scores = {}
    # Keep only the model base name (e.g. "org/model" -> "model").
    model = model.split("/")[-1]
    # Average each task over the three image corruptions.
    identification_corrupt = sum(results[model]['identification'][corrupt]["Score"] for corrupt in corruptions) / len(corruptions)
    count_corrupt = sum(results[model]['count'][corrupt]["Score"] for corrupt in corruptions) / len(corruptions)
    spatial_corrupt = sum(results[model]['spatial'][corrupt]["Score"] for corrupt in corruptions) / len(corruptions)
    attribute_corrupt = sum(results[model]['attribute'][corrupt]["Score"] for corrupt in corruptions) / len(corruptions)
    avg_corrupt = (identification_corrupt + count_corrupt + spatial_corrupt + attribute_corrupt) / 4
    # Average each task over the three style transformations.
    identification_transform = sum(results[model]['identification'][transform]["Score"] for transform in transformations) / len(transformations)
    count_transform = sum(results[model]['count'][transform]["Score"] for transform in transformations) / len(transformations)
    spatial_transform = sum(results[model]['spatial'][transform]["Score"] for transform in transformations) / len(transformations)
    attribute_transform = sum(results[model]['attribute'][transform]["Score"] for transform in transformations) / len(transformations)
    avg_transform = (identification_transform + count_transform + spatial_transform + attribute_transform) / 4
    # Overall score is the mean of the corruption and transformation averages.
    agg_scores["score"] = (avg_corrupt + avg_transform) / 2
    agg_scores["subscenarios"] = {
        "object_corrupt": identification_corrupt,
        "counting_corrupt": count_corrupt,
        "spatial_corrupt": spatial_corrupt,
        "attribute_corrupt": attribute_corrupt,
        "object_transform": identification_transform,
        "counting_transform": count_transform,
        "spatial_transform": spatial_transform,
        "attribute_transform": attribute_transform
    }
    return agg_scores

if __name__ == "__main__":
    t2i_models = [  # Average time spent running the following example
        "dall-e-2",
        "dall-e-3",
        "DeepFloyd/IF-I-M-v1.0",  # 15.372
        "dreamlike-art/dreamlike-photoreal-2.0",  # 3.526
        "prompthero/openjourney-v4",  # 4.981
        "stabilityai/stable-diffusion-xl-base-1.0",  # 7.463
    ]
    i2t_models = [  # Average time spent running the following example
        "gpt-4-vision-preview",
        "gpt-4o-2024-05-13",
        "llava-hf/llava-v1.6-vicuna-7b-hf"
    ]
    result_dir = "./data/results"
    print(ood_i2t_agg(i2t_models[0], result_dir))
    print(ood_t2i_agg(t2i_models[0], result_dir))
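    # Both functions read their summaries from result_dir, so this example
    # expects ./data/results/ood_i2t_summary.json and
    # ./data/results/ood_t2i_summary.json to exist and to contain entries for
    # the selected models.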