Spaces:
Running
Running
| import pandas as pd | |
| from statistics import mean | |
| import pandas as pd | |
| import json | |
| import numpy as np | |
| from statistics import mean | |
| import re | |
| from datasets import load_dataset | |
| import os | |
| from collections import defaultdict | |
| from src.envs import API, SAHARA_DATA, SAHARA_RESULTS | |
# Human-readable title for every task id. The ids are the values found in the
# ``task`` column of the results data and the members of ``CLUSTERS``.
TASKS_LIST = {
    # Text Classification
    "xlni": "Cross-Lingual Natural Language Inference",
    "lid": "Language Identification",
    "news": "News Classification",
    "sentiment": "Sentiment Analysis",
    "topic": "Topic Classification",
    # Text Generation
    "mt_eng2xx": "Machine Translation - English to African",
    "mt_fra2xx": "Machine Translation - French to African",
    "mt_xx2xx": "Machine Translation - African to African",
    "paraphrase": "Paraphrase",
    "summary": "Summarization",
    "title": "Title Generation",
    # MCCR
    "mmlu": "General Knowledge",
    "mgsm": "Mathematical Word Problems",
    "belebele": "Reading Comprehension",
    "squad_qa": "Context-based Question Answering",
    # Tokens
    "ner": "Named Entity Recognition",
    "phrase": "Phrase Chunking",
    "pos": "Part-of-Speech Tagging",
}
# Task ids grouped into the clusters shown on the leaderboard. The dict order
# is the order in which clusters are iterated elsewhere in this module.
CLUSTERS = {
    "Text Classification": ["xlni", "lid", "news", "sentiment", "topic"],
    "Text Generation": [
        "mt_eng2xx",
        "mt_fra2xx",
        "mt_xx2xx",
        "paraphrase",
        "summary",
        "title",
    ],
    "MCCR": ["mmlu", "mgsm", "belebele", "squad_qa"],
    "Tokens": ["ner", "phrase", "pos"],
}

# Every task id, flattened in cluster order.
ALL_TASKS = [task for members in CLUSTERS.values() for task in members]
# ===== Authenticate and Load Data From Private HF Repo =====
def load_private_leaderboard_df():
    """Download the leaderboard results and return them as a pandas DataFrame.

    The ``SAHARA_RESULTS`` data file(s) of the ``SAHARA_DATA`` repository are
    loaded as the ``train`` split. ``download_mode="force_redownload"`` is
    passed so each call fetches the data again rather than reusing a cache.

    NOTE(review): no token is passed explicitly here; access to a private
    repository presumably relies on credentials configured in the
    environment -- confirm.
    """
    results = load_dataset(
        SAHARA_DATA,
        name=None,
        data_files=SAHARA_RESULTS,
        split="train",
        download_mode="force_redownload",
    )
    frame = results.to_pandas()
    return frame
# Display label (HTML allowed) for every metric id found in the ``metric``
# column of the results data.
metrics_list = {
    "bleu_1k": "spBleu<sup>1K</sup>",
    "accuracy": "Accuracy",
    "f1": "Macro-F1",
    "exact_match": "Exact Match",
    "rougeL": "RougeL",
}
# Language code -> display name. Several codes may share one display name
# (e.g. "swa" and "swh"); such codes are grouped together when the
# name -> codes map is built from this table.
LANG_ISO2NAME = {
    "eng": "English",
    "fra": "French",
    # "ara": "Arabic",
    "amh": "Amharic",
    "ewe": "Ewe",
    "hau": "Hausa",
    "ibo": "Igbo",
    "kin": "Kinyarwanda",
    "lin": "Lingala",
    "lug": "Ganda",
    "orm": "Oromo",
    "sna": "Shona",
    "sot": "Southern Sotho",
    "swa": "Swahili",
    "swh": "Swahili",
    "twi": "Twi",
    "wol": "Wolof",
    "xho": "Xhosa",
    "yor": "Yoruba",
    "zul": "Zulu",
    "afr": "Afrikaans",
    "run": "Rundi",
    "tir": "Tigrinya",
    "som": "Somali",
    "pcm": "Nigerian Pidgin",
    "teo": "Teso",
    "nyn": "Nyankore/Nyankole",
    "lgg": "Lugbara",
    "bem": "Bemba/Chibemba",
    "tsn": "Tswana",
    "bbj": "Ghomálá",
    "mos": "Moore",
    "bam": "Bambara",
    "fon": "Fon",
    "ach": "Acholi",
    "nso": "Sepedi",
    "tso": "Tsonga",
    # NOTE(review): spelling is probably meant to be "Fulfulde"; left as-is
    # because the value is shown to users and used as a lookup key.
    "fuv": "Fulfude Nigeria",
    "gaz": "Oromo, West Central",
    "kea": "Kabuverdianu",
    "nya": "Nyanja",
    "ssw": "Swati",
    "luo": "Dholuo/Luo",
    "ven": "Venda",
    # NOTE(review): ISO 639-3 "kir" is Kyrgyz, while Kirundi is "run" above;
    # presumably this mirrors a code used in the results data -- confirm.
    "kir": "Kirundi",
}
# ===== Build Language Name→ISOs map =====
def build_langname_to_isos(iso2name):
    """Invert a code -> name mapping into name -> set of codes.

    Args:
        iso2name: Mapping from language code to display name.

    Returns:
        A ``defaultdict(set)`` keyed by display name; codes that share a
        display name end up in the same set.
    """
    codes_by_name = defaultdict(set)
    for code in iso2name:
        codes_by_name[iso2name[code]].add(code)
    return codes_by_name
# Display name -> set of language codes sharing that name.
LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)

# Show only African languages: the alphabetically sorted display names,
# with English and French filtered out.
LANG_NAME_LIST = sorted(
    name
    for name in LANGNAME2ISOS
    if name not in ('eng', 'fra', 'English', 'French')
)
def get_task_metric_map(df):
    """Return a ``{task: metric}`` dict built from the results frame.

    Args:
        df: DataFrame with ``task`` and ``metric`` columns.

    Returns:
        Dict mapping each task id to its metric id. When a task appears on
        several rows, the metric of the last such row wins.
    """
    # A frame without rows has nothing to map (and may not even carry the
    # expected columns), so answer directly.
    if df.empty:
        return {}
    # Pair the two columns directly instead of materialising a Series per
    # row with iterrows(); dict() keeps the last value seen for each key.
    return dict(zip(df["task"], df["metric"]))
def cluster_average(row, tasks):
    """Average the scores of ``tasks`` found in ``row``.

    Args:
        row: Mapping-like object (dict, pandas Series, ...) indexed by task.
        tasks: Iterable of keys to read from ``row``.

    Returns:
        The mean of all values that could be read and converted with
        ``float()``, or ``np.nan`` when there is none. Keys that are missing
        and values that cannot be converted (for example the ``'---'``
        placeholder or ``None``) are skipped. A NaN value converts
        successfully, so it is kept and makes the mean NaN.
    """
    vals = []
    for t in tasks:
        try:
            vals.append(float(row[t]))
        except (KeyError, IndexError, TypeError, ValueError):
            # Missing key or non-numeric placeholder: leave it out of the mean.
            continue
    return np.mean(vals) if vals else np.nan
def add_medals_to_models(df, score_col="overall score"):
    """Sort a table by score and prefix the best models with a medal.

    Args:
        df: DataFrame with a ``model`` column and the column ``score_col``.
            It is not modified; the work happens on a copy.
        score_col: Column holding the scores, either numbers or strings such
            as ``"12.34"``. Anything that cannot be read as a number (for
            example the ``"---"`` placeholder) is ranked last and never gets
            a medal. The default does not match the column names used by the
            callers in this file ("Overall Score", "Cluster Score"), which
            always pass the name explicitly.

    Returns:
        A new DataFrame sorted by descending score (stable for ties) with a
        fresh 0..n-1 index. ``model`` becomes ``"<symbol> <model>"``: the
        three highest distinct scores get the trophy, silver and bronze
        symbols, and tied models share a symbol. Models without a medal get
        an empty symbol, so their name is prefixed by a single space.
    """
    medals = ["🏆", "🥈", "🥉"]
    score_float_col = "__score_float"

    # Work on a copy so the temporary sort key never leaks into the caller's
    # frame (and no chained-assignment warning is raised for slices).
    ranked = df.copy()
    ranked[score_float_col] = pd.to_numeric(ranked[score_col], errors="coerce")
    # mergesort is stable: models with equal scores keep their input order.
    ranked = ranked.sort_values(
        by=score_float_col, ascending=False, kind="mergesort"
    ).reset_index(drop=True)

    scores = ranked[score_float_col].tolist()
    distinct_scores = sorted({s for s in scores if not pd.isna(s)}, reverse=True)
    # zip() stops after the three medals, so only the top three distinct
    # scores receive a symbol.
    symbol_for_score = dict(zip(distinct_scores, medals))
    symbols = pd.Series(
        [symbol_for_score.get(s, "") for s in scores],
        index=ranked.index,
        dtype=object,
    )

    ranked["model"] = symbols + " " + ranked["model"]
    return ranked.drop(columns=[score_float_col])
def format_cluster_table(df, cluster_tasks, metric_map):
    """Build the display table for one task cluster.

    Args:
        df: Wide frame with a ``model`` column and one column per task.
            Task columns missing from it are added IN PLACE, filled with
            ``'---'`` (see the note in the body).
        cluster_tasks: List of task ids belonging to the cluster.
        metric_map: Dict mapping task id to metric id.

    Returns:
        A new DataFrame with the columns ``model``, ``Cluster Score`` and one
        column per task, sorted and decorated by ``add_medals_to_models``.
        Numeric scores are formatted with two decimals. Task columns are
        renamed to ``"<task title><br>Metric: <metric label>"``.
    """
    # NOTE: this writes into the caller's frame, exactly as the function has
    # always done. load_leaderboards() builds the overall table from that
    # same frame afterwards, so dropping the side effect would change which
    # placeholder value the overall computation sees.
    for task in cluster_tasks:
        if task not in df.columns:
            df[task] = '---'

    # Everything below works on an independent copy.
    table = df[["model"] + list(cluster_tasks)].copy()

    numeric_types = (int, float, np.integer, np.floating)
    for task in cluster_tasks:
        table[task] = table[task].apply(
            lambda x: f"{x:.2f}" if isinstance(x, numeric_types) else x
        )

    cluster_scores = table[cluster_tasks].apply(
        lambda row: cluster_average(row, cluster_tasks), axis=1
    )
    table["Cluster Score"] = cluster_scores.apply(
        lambda x: f"{x:.2f}" if pd.notna(x) else "---"
    )
    table = table[["model", "Cluster Score"] + list(cluster_tasks)]

    # rename = {t: f"{t}\n{metric_map.get(t, '')}" for t in cluster_tasks}
    rename = {}
    for task in cluster_tasks:
        title = TASKS_LIST.get(task, task)
        metric = metric_map.get(task, '')
        # Fall back to the raw metric id when no display label is known, and
        # to the bare title when the task has no metric at all. The latter
        # happens for a task that is absent from the data, which previously
        # raised KeyError('').
        label = metrics_list.get(metric, metric)
        if isinstance(label, str) and label:
            rename[task] = f"{title}<br>Metric: {label}"
        else:
            rename[task] = title
    table = table.rename(columns=rename)

    return add_medals_to_models(table, score_col="Cluster Score")
def format_main_overall_table(df, metric_map):
    """Build the overall leaderboard: one score per cluster plus their mean.

    Args:
        df: Wide frame with a ``model`` column and one column per task.
            It is copied, not modified.
        metric_map: Unused; kept so existing callers keep working.

    Returns:
        A new DataFrame with the columns ``model``, ``Sahara Score`` and one
        column per cluster, sorted and decorated by ``add_medals_to_models``.
        Scores are strings with two decimals, or ``"---"`` when no score
        could be computed.
    """
    main = df.copy()

    for cname, tasks in CLUSTERS.items():
        # Only read task columns that exist; an absent column is treated
        # like any other unreadable value and left out of the average.
        present = [t for t in tasks if t in main.columns]
        if present:
            main[cname] = main[present].apply(
                lambda row: cluster_average(row, present), axis=1
            )
        else:
            main[cname] = np.nan

    cluster_cols = list(CLUSTERS.keys())

    def _mean_of_available(row):
        # Mean over the clusters that have a score. Returning NaN directly
        # for an empty list avoids the RuntimeWarning np.nanmean([]) emits.
        available = [x for x in row if pd.notna(x)]
        return np.mean(available) if available else np.nan

    main["Overall Score"] = main[cluster_cols].apply(_mean_of_available, axis=1)

    for c in cluster_cols + ["Overall Score"]:
        main[c] = main[c].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")

    main = main[["model", "Overall Score"] + cluster_cols]
    main = add_medals_to_models(main, score_col="Overall Score")
    # The overall column is ranked under its internal name and only then
    # given its public title.
    return main.rename(columns={'Overall Score': 'Sahara Score'})
def load_leaderboards():
    """Load the results and build every table derived from the main board.

    Returns:
        A 4-tuple ``(cluster_tabs, main_overall_tab, df, metric_map)``:

        * ``cluster_tabs``: dict mapping cluster name to its display table.
        * ``main_overall_tab``: the overall display table.
        * ``df``: the raw results frame, all leaderboards included.
        * ``metric_map``: dict mapping task id to metric id.

        When there are no rows for the ``main`` leaderboard, both kinds of
        table are replaced by a one-cell "No data" frame.
    """
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)

    main_df = df[df['leaderboard'] == 'main'].copy()
    if main_df.empty:
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        # Same 4-tuple shape as the normal path. This branch used to return
        # six values, which broke callers that unpack four.
        return cluster_tabs, main_overall_tab, df, metric_map

    main_tasks_df = main_df.pivot_table(
        index='model', columns='task', values='score'
    ).reset_index()

    cluster_tabs = {}
    for cname, tasks in CLUSTERS.items():
        cluster_tabs[cname] = format_cluster_table(main_tasks_df, tasks, metric_map)

    # format_cluster_table() adds any missing task column to main_tasks_df
    # in place (filled with '---'), so these columns normally exist already
    # by now; the NaN fill only covers tasks that no cluster handled.
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan

    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map
def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
    """Render a table as an HTML ``<table>`` string.

    Args:
        df: DataFrame to render. Columns whose (string) name contains
            ``"task"`` are left out of the output.
        col_minwidth: Currently unused, see below.
        col_maxwidth: Currently unused, see below.
        model_col_width: Currently unused, see below.

    Returns:
        The HTML produced by ``DataFrame.to_html`` without the index column.

    The three width parameters are kept for backward compatibility only: the
    inline ``<style>`` block that consumed them has been disabled, so this
    function emits no CSS of its own.
    """
    # Remove any column whose name contains "task". Non-string labels cannot
    # contain it and are kept (a bare `"task" in col` raises TypeError on
    # e.g. integer labels).
    drop_cols = [
        col for col in df.columns if isinstance(col, str) and "task" in col
    ]
    df = df.drop(columns=drop_cols, errors="ignore")
    # Clear the columns' axis name (set by pivot tables) before rendering.
    df.columns.name = None

    # escape=False is deliberate: column headers carry markup such as <br>
    # and <sup>. NOTE(review): cell values (e.g. model names) are emitted
    # unescaped as well -- make sure they come from a trusted source.
    return df.to_html(index=False, escape=False)
# Load the leaderboard data once, when this module is imported. The names
# below are module-level state; all_df is read by get_lang_table().
(
    cluster_tabs,
    main_overall_tab,
    all_df,
    metric_map,
) = load_leaderboards()
def get_lang_table(lang_name):
    """Build the per-language leaderboard table for one display name.

    Args:
        lang_name: Language display name, a key of ``LANGNAME2ISOS``.

    Returns:
        A DataFrame with the columns ``model``, ``Language Score`` (mean of
        the task columns, two decimals or ``"---"``) and one column per
        task/language-pair header, sorted and decorated by
        ``add_medals_to_models``. A one-cell "No data for this language"
        frame is returned when the language is unknown or has no rows.
    """
    iso_codes = LANGNAME2ISOS.get(lang_name, [])
    if not iso_codes:
        return pd.DataFrame([{"Info": "No data for this language"}])

    # Find all leaderboards containing any ISO in this language group. A
    # leaderboard id is either a single code or a "src-tgt" pair, so a code
    # must be delimited by the string boundaries or by "-".
    alternatives = "|".join(re.escape(iso) for iso in sorted(iso_codes))
    pattern = re.compile(r"(^|-)(" + alternatives + r")(-|$)")
    matched_langs = [
        lb
        for lb in all_df['leaderboard'].unique()
        if isinstance(lb, str) and lb != 'main' and pattern.search(lb)
    ]

    lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
    if lang_df.empty:
        return pd.DataFrame([{"Info": "No data for this language"}])

    def make_task_col(row):
        # Column header for one result row. Unknown tasks, metrics and
        # language codes fall back to their raw id instead of raising
        # KeyError.
        lb = row['leaderboard']
        task_title = TASKS_LIST.get(row['task'], row['task'])
        metric_label = metrics_list.get(row['metric'], row['metric'])
        if '-' in lb:
            pair_lang = lb.split('-')
            source = LANG_ISO2NAME.get(pair_lang[0], pair_lang[0])
            target = LANG_ISO2NAME.get(pair_lang[1], pair_lang[1])
            return f"{task_title} <br> {source} to {target} <br> Metric: {metric_label}"
        return f"{task_title} <br> Metric: {metric_label}"

    lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
    table = lang_df.pivot_table(
        index='model', columns='task_col', values='score'
    ).reset_index()

    score_cols = [col for col in table.columns if col != 'model']
    numeric_types = (int, float, np.integer, np.floating)
    for col in score_cols:
        table[col] = table[col].apply(
            lambda x: f"{x:.2f}" if isinstance(x, numeric_types) else x
        )

    # Same averaging and medal logic as the cluster tables, through the
    # shared helpers instead of local copies.
    language_scores = table.apply(
        lambda row: cluster_average(row, score_cols), axis=1
    )
    table.insert(
        1,
        'Language Score',
        language_scores.apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"),
    )
    return add_medals_to_models(table, score_col='Language Score')