import pandas as pd
import numpy as np
from typing import Tuple

from datasets import load_dataset, Features, Value

from about import results_repo_validation, results_repo_test
from about import METRICS, STANDARD_COLS
from loguru import logger


def make_user_clickable(name: str):
    # Render the user name as an HTML anchor pointing to their Hugging Face
    # profile (plain anchor markup assumed; no extra styling attributes).
    link = f'https://huggingface.co/{name}'
    return f'<a target="_blank" href="{link}">{name}</a>'


def make_tag_clickable(tag: str):
    # Render a URL (e.g. a model report) as an HTML anchor labelled "link"
    # (plain anchor markup assumed).
    return f'<a target="_blank" href="{tag}">link</a>'


def fetch_dataset_df():
    logger.info("Fetching latest results dataset from Hugging Face Hub...")

    # Specify feature types to load the results dataset
    metric_features = {f'mean_{m}': Value('float64') for m in METRICS}
    metric_features.update({f'std_{m}': Value('float64') for m in METRICS})
    other_features = {
        'user': Value('string'),
        'Endpoint': Value('string'),
        'submission_time': Value('string'),
        'model_report': Value('string'),
        'anonymous': Value('bool'),
    }
    feature_schema = Features(metric_features | other_features)

    dset = load_dataset(
        results_repo_validation,  # change to results_repo_test for the test set
        split='train',
        features=feature_schema,
        download_mode="force_redownload",
    )
    full_df = dset.to_pandas()

    expected_mean_cols = [f"mean_{col}" for col in METRICS]
    expected_std_cols = [f"std_{col}" for col in METRICS]
    expected_all_cols = STANDARD_COLS + expected_mean_cols + expected_std_cols
    assert all(col in full_df.columns for col in expected_all_cols), (
        f"Expected columns not found in {full_df.columns}. "
        f"Missing columns: {set(expected_all_cols) - set(full_df.columns)}"
    )

    df = full_df.copy()
    df = df[df["user"] != "test"].copy()
    df["submission_time"] = pd.to_datetime(df["submission_time"], errors="coerce")
    df = df.dropna(subset=["submission_time"])

    # Get the most recent submission per user & endpoint
    latest = (
        df.sort_values("submission_time")
        .drop_duplicates(subset=["Endpoint", "user"], keep="last")
        .sort_values(["Endpoint", "user"])
        .reset_index(drop=True)
    )
    latest.rename(columns={"submission_time": "submission time"}, inplace=True)
    return latest


def clip_and_log_transform(y: np.ndarray):
    """
    Clip to the detection limit (zero) and transform to log10 scale.

    Parameters
    ----------
    y : np.ndarray
        The array to be clipped and transformed.
    """
    y = np.clip(y, a_min=0, a_max=None)
    return np.log10(y + 1)


def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
    """
    Generate bootstrap samples for a given size and number of samples.

    Parameters
    ----------
    size : int
        The size of the data.
    n_samples : int
        The number of samples to generate.

    Returns
    -------
    np.ndarray
        Returns a numpy array of the bootstrap samples.
""" rng = np.random.default_rng(0) return rng.choice(size, size=(n_samples, size), replace=True) def metrics_per_ep(pred: np.ndarray, true: np.ndarray )->Tuple[float, float, float, float]: """Predict evaluation metrics for a single sample Parameters ---------- pred : np.ndarray Array with predictions true : np.ndarray Array with actual values Returns ------- Tuple[float, float, float, float] Resulting metrics: (MAE, RAE, R2, Spearman R, Kendall's Tau) """ from scipy.stats import spearmanr, kendalltau from sklearn.metrics import mean_absolute_error, r2_score mae = mean_absolute_error(true, pred) rae = mae / np.mean(np.abs(true - np.mean(true))) if np.nanstd(true) == 0: r2=np.nan else: r2 = r2_score(true, pred) if np.nanstd(pred) < 0.0001: spr = np.nan ktau = np.nan else: spr = spearmanr(true, pred).statistic ktau = kendalltau(true, pred).statistic return mae, rae, r2, spr, ktau def bootstrap_metrics(pred: np.ndarray, true: np.ndarray, endpoint: str, n_bootstrap_samples=1000 )->pd.DataFrame: """Calculate bootstrap metrics given predicted and true values Parameters ---------- pred : np.ndarray Predicted endpoints true : np.ndarray Actual endpoint values endpoint : str String with endpoint n_bootstrap_samples : int, optional Size of bootstrapsample, by default 1000 Returns ------- pd.DataFrame Dataframe with estimated metric per bootstrap sample for the given endpoint """ cols = ["Sample", "Endpoint", "Metric", "Value"] bootstrap_results = pd.DataFrame(columns=cols) for i, indx in enumerate( bootstrap_sampling(true.shape[0], n_bootstrap_samples) ): mae, rae, r2, spr, ktau = metrics_per_ep(pred[indx], true[indx]) scores = pd.DataFrame( [ [i, endpoint, "MAE", mae], [i, endpoint, "RAE", rae], [i, endpoint, "R2", r2], [i, endpoint, "Spearman R", spr], [i, endpoint, "Kendall's Tau", ktau] ], columns=cols ) bootstrap_results = pd.concat([bootstrap_results, scores]) return bootstrap_results def map_metric_to_stats(df: pd.DataFrame, average=False) -> pd.DataFrame: """Map mean and std to 'mean +/- std' string for each metric Parameters ---------- df : pd.DataFrame Dataframe to modify average : bool, optional Whether the dataframe contains average info, by default False Returns ------- pd.DataFrame Modified dataframe """ metric_cols = METRICS[:] if average: metric_cols[1] = "MA-RAE" cols_drop = [] for col in metric_cols: mean_col = f"mean_{col}" std_col = f"std_{col}" df[col] = df.apply( lambda row: f"{row[mean_col]:.2f} +/- {row[std_col]:.2f}", axis=1 ) cols_drop.extend([mean_col, std_col]) df = df.drop(columns=cols_drop) return df