import json
import os
import logging
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from huggingface_hub import HfApi

from src.display.formatting import make_clickable_model
from src.leaderboard.read_evals import get_raw_eval_results
from src.config import RESULTS_REPO, QUEUE_REPO

logger = logging.getLogger(__name__)


def get_leaderboard_df(cols: List[str], benchmark_cols: List[str]) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results."""
    logger.info(f"Fetching evaluation results from {RESULTS_REPO}")
    try:
        # Load the dataset directly
        from datasets import load_dataset

        dataset = load_dataset(RESULTS_REPO, split="train")
        logger.debug(f"Loaded dataset with {len(dataset)} rows")
        logger.debug(f"Dataset features: {dataset.features}")

        # Convert dataset to a list of dicts
        all_data_json = [
            {
                "model_id": row["model_id"],
                "revision": row["revision"],
                "precision": row["precision"],
                "security_score": row["security_score"],
                "safetensors_compliant": row["safetensors_compliant"],
            }
            for row in dataset
        ]
        logger.debug(f"Converted dataset to: {json.dumps(all_data_json, indent=2)}")
    except Exception as e:
        logger.error(f"Error loading dataset from {RESULTS_REPO}: {str(e)}", exc_info=True)
        return pd.DataFrame(columns=cols)  # Return empty DataFrame on error

    logger.info(f"Fetched {len(all_data_json)} results")
    logger.debug(f"Data before DataFrame creation: {all_data_json}")

    if not all_data_json:
        logger.warning("No valid data found!")
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(all_data_json)
    logger.info(f"Created DataFrame with columns: {df.columns.tolist()}")
    logger.debug(f"DataFrame before filtering:\n{df}")

    # Ensure all required display columns exist
    for col in cols:
        if col not in df.columns:
            logger.info(f"Adding missing column: {col}")
            df[col] = None

    # Map dataset columns to display columns
    column_mapping = {
        "model_id": "Model",
        "security_score": "Security Score ⬆️",
        "safetensors_compliant": "Safetensors",
        "precision": "Precision",
    }
    for src, dst in column_mapping.items():
        if src in df.columns:
            df[dst] = df[src]
            logger.debug(f"Mapped column {src} to {dst}")

    # Sort by Security Score if available
    if "Security Score ⬆️" in df.columns and not df["Security Score ⬆️"].isnull().all():
        df = df.sort_values(by="Security Score ⬆️", ascending=False)
        logger.info("DataFrame sorted by Security Score")
    else:
        logger.warning("Security Score column not found or all values are null, skipping sorting")

    # Make model names clickable
    if "Model" in df.columns:
        df["Model"] = df["Model"].apply(make_clickable_model)

    # Select only the columns we want to display
    df = df[cols]

    # Round numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df[numeric_cols] = df[numeric_cols].round(decimals=2)

    logger.debug(f"DataFrame after column selection and rounding:\n{df}")
    logger.info(f"Final DataFrame has {len(df)} rows")
    return df


def get_evaluation_queue_df(cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the finished, running, and pending dataframes for the evaluation queue requests."""
    logger.info(f"Looking for eval requests in {QUEUE_REPO}")
    all_evals = []
    api = HfApi()

    try:
        # List all files in the repository and keep only the JSON request files
        files = api.list_repo_files(repo_id=QUEUE_REPO, repo_type="dataset")
        json_files = [f for f in files if f.endswith(".json")]

        for file in json_files:
            try:
                # Download and read each JSON file
                content = api.hf_hub_download(repo_id=QUEUE_REPO, filename=file, repo_type="dataset")
                logger.info(f"Reading JSON file: {file}")
                with open(content, "r") as fp:
                    data = json.load(fp)

                if isinstance(data, list):
                    # Multiple requests in one file
                    for item in data:
                        all_evals.append(format_eval_data(item))
                else:
                    # Single request in the file
                    all_evals.append(format_eval_data(data))
            except Exception as e:
                logger.error(f"Error processing file {file}: {str(e)}", exc_info=True)
    except Exception as e:
        logger.error(f"Error fetching requests from {QUEUE_REPO}: {str(e)}", exc_info=True)

    logger.info(f"Found {len(all_evals)} total eval requests")

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    logger.info(f"Pending: {len(pending_list)}, Running: {len(running_list)}, Finished: {len(finished_list)}")

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]


def format_eval_data(data: dict) -> dict:
    """Format the evaluation data into the structure expected by the queue dataframes."""
    model_name = data.get("model", "")
    return {
        "model": make_clickable_model(model_name),
        "model_raw": model_name,  # Raw model name, without the clickable markup
        "revision": data.get("revision", "main"),
        "private": data.get("private", False),
        "precision": data.get("precision", ""),
        "weight_type": data.get("weight_type", ""),
        "model_type": data.get("model_type", ""),
        "status": data.get("status", ""),
    }