# Spaces: shi-labs / OLA-VLM (Running on Zero)
# File size: 8,038 Bytes
# 9fa3d89

import json
import argparse
from icecream import ic
import os
import numpy as np


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_folder", type=str, default="./playground/data/eval/results")
    parser.add_argument("--ckpt", type=str)
    args = parser.parse_args()

    scores = {}

    dirs = os.listdir(f"{args.results_folder}/{args.ckpt}")
    for dir in dirs:
        if args.ckpt in dir and dir not in args.ckpt:
            break

    
    try:
        with open(f"{args.results_folder}/{args.ckpt}/mmstar/merge_score.json", "r") as f:
            data = json.load(f)
            scores["MMStar"] = round(data.get("final score", 0)*100, 1) if data.get("final score") is not None else None
    except:
        scores["MMStar"] = None
    
    cv_scores = {}

    with open(f"{args.results_folder}/{args.ckpt}/cv-bench/merge_score.json", "r") as f:
        data = json.load(f)
        scores["CV-Bench"] = round(data.get("Overall", 0)*100, 1) if data.get("Overall") is not None else None
        cv_scores["CV-Bench (2D)"] = round(data.get("2D", 0)*100, 1) if data.get("2D") is not None else None
        cv_scores["CV-Bench (3D)"] = round(data.get("3D", 0)*100, 1) if data.get("3D") is not None else None
        cv_scores["CV-Bench (Count)"] = round(data.get("Count", 0)*100, 1) if data.get("Count") is not None else None
        cv_scores["CV-Bench (Depth)"] = round(data.get("Depth", 0)*100, 1) if data.get("Depth") is not None else None
        cv_scores["CV-Bench (Relation)"] = round(data.get("Relation", 0)*100, 1) if data.get("Relation") is not None else None
        cv_scores["CV-Bench (Distance)"] = round(data.get("Distance", 0)*100, 1) if data.get("Distance") is not None else None

    
    with open(f"{args.results_folder}/{args.ckpt}/{dir}/results.json", "r") as f:
        results = json.load(f).get("results", {})
        # scores["MME-Cognition"] = round(results.get("mme", {}).get("mme_cognition_score,none", 0), 1) if results.get("mme", {}).get("mme_cognition_score,none") is not None else None
        # scores["MME-Perception"] = round(results.get("mme", {}).get("mme_percetion_score,none", 0), 1) if results.get("mme", {}).get("mme_percetion_score,none") is not None else None

        scores["Realworld-QA"] = round(results.get("realworldqa", {}).get("exact_match,flexible-extract", 0)*100, 1) if results.get("realworldqa", {}).get("exact_match,flexible-extract") is not None else None
        scores["VizWiz-VQA-Val"] = round(results.get("vizwiz_vqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vizwiz_vqa_val", {}).get("exact_match,none") is not None else None
        # scores["SEEDBench-Image"] = round(results.get("seedbench", {}).get("seed_image,none", 0)*100, 1) if results.get("seedbench", {}).get("seed_image,none") is not None else None
        # scores["VQAv2-Val"] = round(results.get("vqav2_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vqav2_val", {}).get("exact_match,none") is not None else None
        
        # scores["Science-QA-Img"] = round(results.get("scienceqa_img", {}).get("exact_match,none", 0)*100, 1) if results.get("scienceqa_img", {}).get("exact_match,none") is not None else None
        scores["MMMU-Val"] = round(results.get("mmmu_val", {}).get("mmmu_acc,none", 0)*100, 1) if results.get("mmmu_val", {}).get("mmmu_acc,none") is not None else None
        # scores["MMBench"] = round(results.get("mmbench_en_dev", {}).get("gpt_eval_score,none", 0), 1) if results.get("mmbench_en_dev", {}).get("gpt_eval_score,none") is not None else None

        # scores["NaturalBench"] = round(results.get("naturalbench", {}).get("mme_score,none", 0)*100, 1) if results.get("naturalbench", {}).get("mme_score,none") is not None else None

        # scores["GQA"] = round(results.get("gqa", {}).get("exact_match,none", 0)*100, 1) if results.get("gqa", {}).get("exact_match,none") is not None else None
        scores["POPE"] = round(results.get("pope", {}).get("pope_accuracy,none", 0)*100, 1) if results.get("pope", {}).get("pope_accuracy,none") is not None else None
        scores["MMVet"] = round(results.get("mmvet", {}).get("gpt_eval_score", 0)*100, 1) if results.get("mmvet", {}).get("gpt_eval_score") is not None else None
        scores["OK-VQA"] = round(results.get("ok_vqa", {}).get("exact_match,none", 0)*100, 1) if results.get("ok_vqa", {}).get("exact_match,none") is not None else None
        # scores["ChartQA"] = round(results.get("chartqa", {}).get("relaxed_overall,none", 0)*100, 1) if results.get("chartqa", {}).get("relaxed_overall,none") is not None else None
        # scores["DocVQA"] = round(results.get("docvqa_val", {}).get("anls,none", 0)*100, 1) if results.get("docvqa_val", {}).get("anls,none") is not None else None
        # scores["TextVQA"] = round(results.get("textvqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("textvqa_val", {}).get("exact_match,none") is not None else None
    
    try:
        with open(f"{args.results_folder}/{args.ckpt}/mmvp/merge_score.json", "r") as f:
            data = json.load(f)
            scores["MMVP"] = round(data.get("mmvp", 0)*100, 1) if data.get("mmvp") is not None else None
    except:
        scores["MMVP"] = None

    keys = list(scores.keys())
    str_scores = [str(scores[key]) if scores[key] is not None else 'None' for key in keys]

    abl_keys = ["CV-Bench", "MMStar", "VizWiz-VQA-Val", "MMVet", "MMVP", "MMMU-Val"]
    
    abl_scores = [scores[key] for key in abl_keys if scores[key] is not None]

    small_abl_keys = ["CV-Bench", "MMStar", "OK-VQA", "MMMU-Val"]
    small_abl_scores = [scores[key] for key in small_abl_keys if scores[key] is not None]

    cv_bench_keys = ["CV-Bench (2D)", "CV-Bench (3D)", "CV-Bench (Count)", "CV-Bench (Depth)", "CV-Bench (Relation)", "CV-Bench (Distance)"]
    cv_bench_scores = [cv_scores[key] for key in cv_bench_keys if cv_scores[key] is not None]
    
    # cat_scores = {}
    # if os.path.exists(f"{args.results_folder}/{args.ckpt}/categorized_scores.json"):
    #     with open(f"{args.results_folder}/{args.ckpt}/categorized_scores.json", "r") as f:
    #         cat_scores = json.load(f)
    #         cat_scores.pop("Both")
    
    print("\n====================All-Scores===========================================")
    print(" & ".join(keys))
    print(" & ".join(str_scores))
    if abl_scores:
        print("\n====================Abl-Scores===========================================")
        print(" & ".join(abl_keys))
        print(" & ".join([str(a) for a in abl_scores]))
        print(f"Ablation Avg: {round(np.mean(abl_scores), 1)}")
    else:
        print("Ablation Avg: None")
    
    if small_abl_scores:
        print("\n====================Small-Abl-Scores===========================================")
        print(" & ".join(small_abl_keys))
        print(" & ".join([str(a) for a in small_abl_scores]))
        print(f"Small-Ablation Avg: {round(np.mean(small_abl_scores), 1)}")
    else:
        print("Small-Ablation Avg: None")
    
    if cv_bench_scores:
        print("\n====================CV-Bench-Scores===========================================")
        print(" & ".join(cv_bench_keys))
        print(" & ".join([str(c) for c in cv_bench_scores]))
        print(f"CV-Bench Overall: {round(np.mean(cv_bench_scores[:2]), 1)}")
    else:
        print("CV-Bench Avg: None")
    
    # if cat_scores is not None:
    #     print("\n====================Categorized-Scores===========================================")
    #     cats = []
    #     class_scores = []
    #     benches = []
    #     for k, v in cat_scores.items():
    #         cats.append(k)
    #         for bench, score in v.items():
    #             benches.append(bench)
    #             class_scores.append(round(score*100, 1))
    #     print(" & ".join(cats))
    #     print(" & ".join(benches))
    #     print(" & ".join([str(c) for c in class_scores]))
    # print("================================================================")