"""Collect benchmark results published on the Hugging Face Hub.

Defines :class:`ModelHandler`, which caches per-model ``results.json``
payloads in a local JSON file and turns them into score tables.
"""

import json
import os
from typing import Any, Dict, List

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

from required_categories import (
    required_mmlu_categories,
    required_unified_exam_categories,
)


class ModelHandler:
    """Fetch, cache and tabulate results of Hub models.

    The cache (``self.model_infos``) is a list of
    ``{"model_name": ..., "results": ...}`` dicts that is persisted as JSON
    at ``model_infos_path``.
    """

    def __init__(self, model_infos_path="model_results.json"):
        """Create the Hub client and load any previously cached results.

        Args:
            model_infos_path: Path of the JSON file used as the local cache.
        """
        self.api = HfApi()
        self.model_infos_path = model_infos_path
        self.model_infos = self._load_model_infos()

    def _load_model_infos(self) -> List[Dict[str, Any]]:
        """Return the cached model entries, or an empty list if unavailable.

        The cache must be a list because the rest of the class appends to it
        and iterates it as a sequence of dicts. Anything else found on disk
        (for example a ``{}`` written while the cache used to default to a
        dict) is ignored so that new results can be recorded.
        """
        if os.path.exists(self.model_infos_path):
            with open(self.model_infos_path, encoding="utf-8") as f:
                cached = json.load(f)
            if isinstance(cached, list):
                return cached
            if cached:
                print(f"Ignoring unexpected cache content in {self.model_infos_path}")
        return []

    def _save_model_infos(self):
        """Write the in-memory cache to ``model_infos_path`` as indented JSON."""
        print("Saving model infos")
        with open(self.model_infos_path, "w", encoding="utf-8") as f:
            json.dump(self.model_infos, f, indent=4)

    @staticmethod
    def _build_row(model_name, results, required_categories):
        """Return a table row for one model, or ``None`` if it does not qualify.

        Args:
            model_name: Value stored under the ``"Model"`` key of the row.
            results: List of dicts, each with ``"category"`` and ``"score"``.
            required_categories: Categories that must all appear in
                ``results`` for the row to be produced.

        Returns:
            A dict mapping ``"Model"`` and every reported category to its
            score, or ``None`` when ``results`` is empty or a required
            category is missing.
        """
        if not results:
            return None
        reported = {result["category"] for result in results}
        if not reported.issuperset(required_categories):
            return None
        row = {"Model": model_name}
        for result in results:
            row[result["category"]] = result["score"]
        return row

    def get_arm_bench_data(self):
        """Refresh the cache from the Hub and build the score tables.

        Lists models matching the ``ArmBench-LLM`` filter, downloads
        ``results.json`` for every repository not yet cached, saves the
        cache, and converts the cached results into two tables.

        Returns:
            A ``(mmlu_df, unified_exam_df)`` tuple of ``pandas.DataFrame``
            objects, one row per model that reports every required category.
        """
        models = self.api.list_models(filter="ArmBench-LLM")
        known_models = {entry["model_name"] for entry in self.model_infos}
        repositories = [model.modelId for model in models]

        for repo_id in repositories:
            # Already cached: skip before spending a network call on it.
            if repo_id in known_models:
                continue
            if "results.json" not in self.api.list_repo_files(repo_id):
                continue
            try:
                result_path = hf_hub_download(repo_id, filename="results.json")
                with open(result_path, encoding="utf-8") as f:
                    results = json.load(f)
            except Exception as e:
                # Best effort: one broken repository must not stop the others.
                print(f"Error loading {repo_id} - {e}")
                continue
            self.model_infos.append({"model_name": repo_id, "results": results})
            known_models.add(repo_id)

        self._save_model_infos()

        mmlu_data = []
        unified_exam_data = []
        for model in self.model_infos:
            model_name = model["model_name"]
            results = model.get("results", {})

            mmlu_row = self._build_row(
                model_name,
                results.get("mmlu_results", []),
                required_mmlu_categories,
            )
            if mmlu_row is not None:
                mmlu_data.append(mmlu_row)

            unified_exam_row = self._build_row(
                model_name,
                results.get("unified_exam_results", []),
                required_unified_exam_categories,
            )
            if unified_exam_row is not None:
                unified_exam_data.append(unified_exam_row)

        mmlu_df = pd.DataFrame(mmlu_data)
        unified_exam_df = pd.DataFrame(unified_exam_data)
        return mmlu_df, unified_exam_df