import os
import subprocess
import streamlit as st

import datasets

from constants import DIALECTS_WITH_LABELS
from inspect import getmembers, isfunction
import eval_utils
import utils
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from huggingface_hub import HfApi

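# Client for the Hugging Face Hub API, used below to resolve submitted model
# revisions (branch names or commit ids) to concrete commit ids.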
api = HfApi()

st.set_page_config(layout="wide")
st.title("MLADI Leaderboard")
st.write(
    "The Multi-label Arabic Dialect Identification (MLADI) leaderboard serves as a public interface for benchmarking ADI "
    "models using an 'extended version' of the NADI 2024 test set, "
    "the first multi-label country-level ADI dataset.\n\n"
    "🔜 More information about the dataset extension will be coming soon, stay tuned!"
)

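# Shared-task teams are listed on the leaderboard as well; their entries link to
# the system-description papers below instead of a Hugging Face model repo.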
SHARED_TASK_TEAMS = {
    "Elyadata": "https://aclanthology.org/2024.arabicnlp-1.85/",
    "NLP_DI": "https://aclanthology.org/2024.arabicnlp-1.82/",
    "dzNlp": "https://aclanthology.org/2024.arabicnlp-1.84/",
}
tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
with tab1:
    # Load the labels
    dataset_name = os.environ["DATASET_NAME"]
    dataset = datasets.load_dataset(dataset_name)["test"]
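    # Build a dialect -> per-sample boolean column mapping; each column is later
    # used as y_true when scoring that dialect.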
    labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}

    print("Loaded the labels, no. of samples:", len(dataset))

    # Load the models' predictions
    try:
        model_predictions_rows = datasets.load_dataset(
            os.environ["PREDICTIONS_DATASET_NAME"]
        )["train"]

    except Exception as e:
        print(f"Failed to load the predictions dataset: {e}")
        st.info("Error in loading the results!")
        model_predictions_rows = []

    if model_predictions_rows:
        # TODO: Store these metrics in a separate dataset!
        evaluation_metrics = []
        for row in model_predictions_rows:
            # Evaluate the models
            accuracy_scores = {}
            f1_scores = {}
            recall_scores = {}
            precision_scores = {}
            predictions = row["predictions"]

            # Only completed evaluations are shown on the leaderboard.
            if row["status"] != "completed":
                continue

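            # Score each dialect as an independent binary classification task:
            # y_pred marks whether the model included the dialect in its prediction.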
            for dialect in DIALECTS_WITH_LABELS:
                y_true = labels[dialect]
                y_pred = [dialect in prediction for prediction in predictions]
                accuracy = accuracy_score(y_true, y_pred)
                f1 = f1_score(y_true, y_pred)
                recall = recall_score(y_true, y_pred)
                precision = precision_score(y_true, y_pred)

                accuracy_scores[dialect] = accuracy
                f1_scores[dialect] = f1
                recall_scores[dialect] = recall
                precision_scores[dialect] = precision

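            # Macro-average across dialects so that every dialect contributes
            # equally, regardless of how many positive samples it has.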
            macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
            macro_avg_f1 = np.mean(list(f1_scores.values()))
            macro_avg_recall = np.mean(list(recall_scores.values()))
            macro_avg_precision = np.mean(list(precision_scores.values()))

            evaluation_metrics.append(
                {
                    "Model Name": row["model_name"],
                    "Accuracy": macro_avg_accuracy,
                    "Recall": macro_avg_recall,
                    "Precision": macro_avg_precision,
                    "F1 score": macro_avg_f1,
                    "Inference Method": row["inference_function"],
                    "URL": f"https://huggingface.co/{row['model_name']}"
                    if ("shared task team" not in row["model_name"])
                    else SHARED_TASK_TEAMS[row["model_name"].split(" (")[0]],
                    "Commit ID": row["commit_id"][:5]
                    if ("shared task team" not in row["model_name"])
                    else "N/A",
                }
            )

        if evaluation_metrics:
            results_df = pd.DataFrame(evaluation_metrics).sort_values(
                "F1 score", ascending=False
            )
            results_df["Rank"] = range(1, len(results_df) + 1)

            results_df = results_df[
                [
                    "Rank",
                    "Model Name",
                    "F1 score",
                    "Precision",
                    "Recall",
                    "Accuracy",
                    "Inference Method",
                    "URL",
                    "Commit ID",
                ]
            ]
            st.data_editor(
                results_df,
                column_config={
                    "URL": st.column_config.LinkColumn("URL", required=False),
                },
                hide_index=True,
            )
            st.write("Note: The metrics are macro-averaged across all 11 dialects.")

        with st.expander("Click for more information."):
            inference_functions_names = [
                func_name for func_name, _ in getmembers(eval_utils, isfunction)
            ]
            # Show the docstring of the inference functions
            inference_functions_docstring = [
                getattr(eval_utils, func).__doc__ for func in inference_functions_names
            ]

            inference_functions_df = pd.DataFrame(
                {
                    "Method": inference_functions_names,
                    "Description": inference_functions_docstring,
                }
            )
            st.markdown("## Inference Methods' Descriptions", unsafe_allow_html=True)
            st.markdown(
                inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
            )

            with open("leaderboard_info.md", "r") as f:
                MARKDOWN_TEXT = f.read()
            st.markdown(MARKDOWN_TEXT)
            st.markdown("For any inquiries, please do not hesitate to contact me: https://amr-keleg.github.io/")

        with st.expander("Cite this leaderboard!"):
            st.write(
                """
                Please cite the following paper in which we introduced the NADI 2024 evaluation sets:
                ```
                @inproceedings{abdul-mageed-etal-2024-nadi,
                    title = "{NADI} 2024: The Fifth Nuanced {A}rabic Dialect Identification Shared Task",
                    author = "Abdul-Mageed, Muhammad  and
                    Keleg, Amr  and
                    Elmadany, AbdelRahim  and
                    Zhang, Chiyu  and
                    Hamed, Injy  and
                    Magdy, Walid  and
                    Bouamor, Houda  and
                    Habash, Nizar",
                    editor = "Habash, Nizar  and
                    Bouamor, Houda  and
                    Eskander, Ramy  and
                    Tomeh, Nadi  and
                    Abu Farha, Ibrahim  and
                    Abdelali, Ahmed  and
                    Touileb, Samia  and
                    Hamed, Injy  and
                    Onaizan, Yaser  and
                    Alhafni, Bashar  and
                    Antoun, Wissam  and
                    Khalifa, Salam  and
                    Haddad, Hatem  and
                    Zitouni, Imed  and
                    AlKhamissi, Badr  and
                    Almatham, Rawan  and
                    Mrini, Khalil",
                    booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference",
                    month = aug,
                    year = "2024",
                    address = "Bangkok, Thailand",
                    publisher = "Association for Computational Linguistics",
                    url = "https://aclanthology.org/2024.arabicnlp-1.79",
                    doi = "10.18653/v1/2024.arabicnlp-1.79",
                    pages = "709--728",
                }
                ```
                """
            )

    # Evaluate the models queued
    if model_predictions_rows:
        models_to_be_evaluated = []
        models_in_progress = []
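        # Each row carries a status: "queued" (awaiting evaluation), "in_progress",
        # or "completed" (already shown on the leaderboard above).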

        for row in model_predictions_rows:
            if row["status"] == "queued":
                models_to_be_evaluated.append(row)
            elif row["status"] == "in_progress":
                models_in_progress.append(row)

        # Re-queue evaluations that have been stalled for more than a day (86,400 seconds).
        # Iterate over a copy, since stalled models are removed from the list below.
        for model in list(models_in_progress):
            timestamp = model["last_updated_timestamp"]
            if utils.current_seconds_time() - timestamp > 86400:
                utils.update_model_queue(
                    repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
                    model_name=model["model_name"],
                    commit_id=model["commit_id"],
                    inference_function=model["inference_function"],
                    status="queued",
                )
                print(
                    f"The evaluation of {model['model_name']} has been stalled for "
                    "more than a day. Re-queuing it."
                )
                models_to_be_evaluated.append(model)
                models_in_progress.remove(model)

        if not models_in_progress:
            for row in models_to_be_evaluated:
                # Evaluate the model
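                # Launch the evaluation as a non-blocking background process so the
                # Streamlit app stays responsive while inference runs.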
                subprocess.Popen(
                    [
                        "python",
                        "background_inference.py",
                        row["model_name"],
                        row["commit_id"],
                        row["inference_function"],
                    ]
                )
                print(f"Started the evaluation of {row['model_name']}.")

with tab2:
    model_name = st.text_input("Enter a model's name on HF")
    model_revision = st.text_input(
        "Enter a model's revision on HF (commit id, or branch name)",
        placeholder="main",
        value="main",
    )
    inference_functions_names = [
        func_name for func_name, _ in getmembers(eval_utils, isfunction)
    ]
    inference_function = st.selectbox(
        "Inference Method",
        inference_functions_names,
    )

    # TODO: Allow modifying the adhoc threshold values of the different inference methods

    # Show the docstring of the inference functions
    inference_functions_docstring = [
        getattr(eval_utils, func).__doc__ for func in inference_functions_names
    ]

    inference_functions_df = pd.DataFrame(
        {
            "Method": inference_functions_names,
            "Description": inference_functions_docstring,
        }
    )
    with st.expander("Check the inference methods' short descriptions"):
        st.markdown(
            inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
        )
        st.write(
            "Note: We are happy to discuss adding new custom inference methods for your models."
        )

    if model_name and model_revision and inference_function:
        # Get the model's commit id
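        # Index 0 of the commit history resolves a branch name (e.g. "main") to the
        # commit it currently points to, pinning the submission to an exact snapshot.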
        commit_id = api.list_repo_commits(model_name, revision=model_revision)[
            0
        ].commit_id

        model_predictions_rows = datasets.load_dataset(
            os.environ["PREDICTIONS_DATASET_NAME"]
        )["train"]

        # Check if the model is already in the leaderboard
        model_exists = any(
            row["model_name"] == model_name
            and row["commit_id"] == commit_id
            and row["inference_function"] == inference_function
            for row in model_predictions_rows
        )

        if not model_exists:
            # Add the model to the evaluation queue
            utils.update_model_queue(
                repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
                model_name=model_name,
                commit_id=commit_id,
                inference_function=inference_function,
                status="queued",
            )
            st.info(
                f"The evaluation of the model {model_name} is queued for processing."
            )

        else:
            st.info(
                f"The model {model_name} has already been submitted to the leaderboard."
            )