File size: 8,038 Bytes
9fa3d89 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import json
import argparse
from icecream import ic
import os
import numpy as np
# Benchmarks read from the run directory's results.json.
# Each row is (label, key under "results", metric key, scale applied before
# rounding). Commented-out rows are disabled and are not reported.
_RESULTS_METRICS = (
    # ("MME-Cognition", "mme", "mme_cognition_score,none", 1),
    # ("MME-Perception", "mme", "mme_percetion_score,none", 1),
    ("Realworld-QA", "realworldqa", "exact_match,flexible-extract", 100),
    ("VizWiz-VQA-Val", "vizwiz_vqa_val", "exact_match,none", 100),
    # ("SEEDBench-Image", "seedbench", "seed_image,none", 100),
    # ("VQAv2-Val", "vqav2_val", "exact_match,none", 100),
    # ("Science-QA-Img", "scienceqa_img", "exact_match,none", 100),
    ("MMMU-Val", "mmmu_val", "mmmu_acc,none", 100),
    # ("MMBench", "mmbench_en_dev", "gpt_eval_score,none", 1),
    # ("NaturalBench", "naturalbench", "mme_score,none", 100),
    # ("GQA", "gqa", "exact_match,none", 100),
    ("POPE", "pope", "pope_accuracy,none", 100),
    # NOTE(review): unlike every other row, this metric key has no ",none"
    # suffix -- confirm against a real results.json.
    ("MMVet", "mmvet", "gpt_eval_score", 100),
    ("OK-VQA", "ok_vqa", "exact_match,none", 100),
    # ("ChartQA", "chartqa", "relaxed_overall,none", 100),
    # ("DocVQA", "docvqa_val", "anls,none", 100),
    # ("TextVQA", "textvqa_val", "exact_match,none", 100),
)

# Keys of cv-bench/merge_score.json reported as "CV-Bench (<key>)".
_CV_BENCH_SPLITS = ("2D", "3D", "Count", "Depth", "Relation", "Distance")

# Benchmark subsets that get their own table and average.
_ABLATION_KEYS = ["CV-Bench", "MMStar", "VizWiz-VQA-Val", "MMVet", "MMVP", "MMMU-Val"]
_SMALL_ABLATION_KEYS = ["CV-Bench", "MMStar", "OK-VQA", "MMMU-Val"]


def _load_json(path):
    """Return the parsed content of the JSON file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _scaled(mapping, key, scale=100):
    """Return ``mapping[key] * scale`` rounded to 1 decimal, or None if absent."""
    value = mapping.get(key)
    if value is None:
        return None
    return round(value * scale, 1)


def _optional_score(path, key):
    """Like ``_scaled`` on the file at *path*, but None if it cannot be read."""
    try:
        return _scaled(_load_json(path), key)
    except (OSError, ValueError, AttributeError, TypeError):
        # Missing file, invalid JSON, or an unexpected payload shape: this
        # benchmark is optional, so report it as "not available".
        return None


def _find_run_dir(ckpt_dir, ckpt):
    """Return the name of the sub-directory of *ckpt_dir* holding results.json.

    A run directory is an entry whose name contains *ckpt* and is not itself
    a substring of *ckpt*.

    Raises:
        FileNotFoundError: if *ckpt_dir* is missing or has no such entry.
    """
    # Sorted so the choice is deterministic when several entries match;
    # os.listdir() order is arbitrary.
    matches = sorted(
        name for name in os.listdir(ckpt_dir) if ckpt in name and name not in ckpt
    )
    if not matches:
        raise FileNotFoundError(
            f"no run directory containing '{ckpt}' found in {ckpt_dir}"
        )
    return matches[0]


def _print_group(keys, table, avg_label, empty_message, avg_keys=None):
    """Print a header row, a value row and an average for *keys* of *table*.

    Missing scores are printed as ``None`` so values stay aligned with the
    header; the average only uses the scores that are present. *avg_keys*
    restricts the average to a subset of *keys*. When every score of the
    group is missing, only *empty_message* is printed.
    """
    if all(table[key] is None for key in keys):
        print(empty_message)
        return
    print(" & ".join(keys))
    print(" & ".join(str(table[key]) for key in keys))
    pool = [table[key] for key in (avg_keys or keys) if table[key] is not None]
    average = round(float(np.mean(pool)), 1) if pool else None
    print(f"{avg_label}: {average}")


def main(argv=None):
    """Collect benchmark scores of one checkpoint and print them as tables.

    Reads ``<results_folder>/<ckpt>/{mmstar,cv-bench,mmvp}/merge_score.json``
    and ``<results_folder>/<ckpt>/<run dir>/results.json``, then prints
    " & "-separated header/value rows for all benchmarks, the ablation
    subsets and the CV-Bench categories.

    Args:
        argv: command-line arguments; defaults to ``sys.argv[1:]``.

    Raises:
        FileNotFoundError: if the checkpoint folder, its run directory, the
            CV-Bench score file or results.json is missing. The MMStar and
            MMVP files are optional and are reported as ``None`` instead.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_folder", type=str, default="./playground/data/eval/results")
    parser.add_argument("--ckpt", type=str, required=True)
    args = parser.parse_args(argv)

    ckpt_dir = os.path.join(args.results_folder, args.ckpt)
    run_dir = _find_run_dir(ckpt_dir, args.ckpt)

    # Insertion order of `scores` is the column order of the first table.
    scores = {}
    scores["MMStar"] = _optional_score(
        os.path.join(ckpt_dir, "mmstar", "merge_score.json"), "final score"
    )

    cv_data = _load_json(os.path.join(ckpt_dir, "cv-bench", "merge_score.json"))
    scores["CV-Bench"] = _scaled(cv_data, "Overall")
    cv_scores = {
        f"CV-Bench ({split})": _scaled(cv_data, split) for split in _CV_BENCH_SPLITS
    }

    results = _load_json(os.path.join(ckpt_dir, run_dir, "results.json")).get("results", {})
    for label, section, metric, scale in _RESULTS_METRICS:
        scores[label] = _scaled(results.get(section, {}), metric, scale)

    scores["MMVP"] = _optional_score(
        os.path.join(ckpt_dir, "mmvp", "merge_score.json"), "mmvp"
    )

    keys = list(scores)
    print(" & ".join(keys))
    print(" & ".join(str(scores[key]) for key in keys))

    _print_group(_ABLATION_KEYS, scores, "Ablation Avg", "Ablation Avg: None")
    _print_group(
        _SMALL_ABLATION_KEYS, scores, "Small-Ablation Avg", "Small-Ablation Avg: None"
    )
    # "Overall" is the mean of the 2D and 3D categories, selected by key so a
    # missing category cannot shift another one into the average.
    _print_group(
        list(cv_scores),
        cv_scores,
        "CV-Bench Overall",
        "CV-Bench Avg: None",
        avg_keys=["CV-Bench (2D)", "CV-Bench (3D)"],
    )
    # NOTE: per-category reporting from categorized_scores.json is not implemented here.


if __name__ == "__main__":
    main()