# MLADI / app.py
import os
import subprocess
import streamlit as st
import datasets
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from constants import DIALECTS_WITH_LABELS
from inspect import getmembers, isfunction
import eval_utils
import utils
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from huggingface_hub import HfApi
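# Hugging Face Hub client, used to resolve a submitted revision into a concrete commit id.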
api = HfApi()
st.set_page_config(layout="wide")
st.title("MLADI Leaderboard")
st.write(
"The Multi-label Arabic Dialect Identification (MLADI) leaderboard serves as a public interface for benchmarking ADI "
"models using an 'extended version' of the NADI 2024 test set, "
"the first multi-label country-level ADI dataset.\n\n"
"🔜 More information about the dataset extension will be coming soon, stay tuned!"
)
SHARED_TASK_TEAMS = {
"Elyadata": "https://aclanthology.org/2024.arabicnlp-1.85/",
"NLP_DI": "https://aclanthology.org/2024.arabicnlp-1.82/",
"dzNlp": "https://aclanthology.org/2024.arabicnlp-1.84/",
}
tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
with tab1:
# Load the labels
dataset_name = os.environ["DATASET_NAME"]
dataset = datasets.load_dataset(dataset_name)["test"]
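    # Per-dialect gold labels for the test samples (used as y_true below).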
labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
print("Loaded the labels, no. of samples:", len(dataset))
# Load the models' predictions
try:
model_predictions_rows = datasets.load_dataset(
os.environ["PREDICTIONS_DATASET_NAME"]
)["train"]
    except Exception:
        st.info("Error in loading the results!")
        model_predictions_rows = []
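    # Compute the leaderboard metrics only when the predictions dataset was loaded successfully.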
if model_predictions_rows:
# TODO: Store these metrics in a separate dataset!
evaluation_metrics = []
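        # Each row corresponds to one submission: its test-set predictions plus model/status metadata.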
for row in model_predictions_rows:
# Evaluate the models
accuracy_scores = {}
f1_scores = {}
recall_scores = {}
precision_scores = {}
predictions = row["predictions"]
            # Only completed submissions are scored.
            if row["status"] != "completed":
                continue
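            # Score each dialect as an independent binary classification task.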
for dialect in DIALECTS_WITH_LABELS:
y_true = labels[dialect]
y_pred = [dialect in prediction for prediction in predictions]
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
accuracy_scores[dialect] = accuracy
f1_scores[dialect] = f1
recall_scores[dialect] = recall
precision_scores[dialect] = precision
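            # Macro-average the per-dialect scores to get one value per metric.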
macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
macro_avg_f1 = np.mean(list(f1_scores.values()))
macro_avg_recall = np.mean(list(recall_scores.values()))
macro_avg_precision = np.mean(list(precision_scores.values()))
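            # Shared-task team entries link to their system description papers rather than a Hub model page,
            # and have no commit id.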
evaluation_metrics.append(
{
"Model Name": row["model_name"],
"Accuracy": macro_avg_accuracy,
"Recall": macro_avg_recall,
"Precision": macro_avg_precision,
"F1 score": macro_avg_f1,
"Inference Method": row["inference_function"],
"URL": f"https://huggingface.co/{row['model_name']}"
if ("shared task team" not in row["model_name"])
else SHARED_TASK_TEAMS[row["model_name"].split(" (")[0]],
"Commit ID": row["commit_id"][:5]
if ("shared task team" not in row["model_name"])
else "N/A",
}
)
if evaluation_metrics:
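            # Rank submissions by macro-averaged F1 score (descending).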
results_df = pd.DataFrame(evaluation_metrics).sort_values(
"F1 score", ascending=False
)
results_df["Rank"] = range(1, len(results_df) + 1)
results_df = results_df[
[
"Rank",
"Model Name",
"F1 score",
"Precision",
"Recall",
"Accuracy",
"Inference Method",
"URL",
"Commit ID",
]
]
st.data_editor(
results_df,
column_config={
"URL": st.column_config.LinkColumn("URL", required=False),
},
hide_index=True,
)
st.write("Note: The metrics are macro-averaged across all 11 dialects.")
with st.expander("Click for more information."):
inference_functions_names = [
func_name for func_name, _ in getmembers(eval_utils, isfunction)
]
# Show the docstring of the inference functions
inference_functions_docstring = [
getattr(eval_utils, func).__doc__ for func in inference_functions_names
]
inference_functions_df = pd.DataFrame(
{
"Method": inference_functions_names,
"Description": inference_functions_docstring,
}
)
st.markdown("## Inference Methods' Descriptions", unsafe_allow_html=True)
st.markdown(
inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
)
with open("leaderboard_info.md", "r") as f:
MARKDOWN_TEXT = f.read()
st.markdown(MARKDOWN_TEXT)
st.markdown("For any inquiries, please do not hesistate to contact me: https://amr-keleg.github.io/")
with st.expander("Cite this leaderboard!"):
st.write(
"""
Please cite the following paper in which we introduced the NADI 2024 evaluation sets:
```
@inproceedings{abdul-mageed-etal-2024-nadi,
title = "{NADI} 2024: The Fifth Nuanced {A}rabic Dialect Identification Shared Task",
author = "Abdul-Mageed, Muhammad and
Keleg, Amr and
Elmadany, AbdelRahim and
Zhang, Chiyu and
Hamed, Injy and
Magdy, Walid and
Bouamor, Houda and
Habash, Nizar",
editor = "Habash, Nizar and
Bouamor, Houda and
Eskander, Ramy and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Abdelali, Ahmed and
Touileb, Samia and
Hamed, Injy and
Onaizan, Yaser and
Alhafni, Bashar and
Antoun, Wissam and
Khalifa, Salam and
Haddad, Hatem and
Zitouni, Imed and
AlKhamissi, Badr and
Almatham, Rawan and
Mrini, Khalil",
booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.arabicnlp-1.79",
doi = "10.18653/v1/2024.arabicnlp-1.79",
pages = "709--728",
}
```
"""
)
# Evaluate the models queued
if model_predictions_rows:
models_to_be_evaluated = []
models_in_progress = []
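        # Split submissions by status: "queued" models await evaluation, "in_progress" ones are currently being evaluated.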
for row in model_predictions_rows:
if row["status"] == "queued":
models_to_be_evaluated.append(row)
elif row["status"] == "in_progress":
models_in_progress.append(row)
        # Iterate over a copy: stale models are removed from models_in_progress inside the loop.
        for model in list(models_in_progress):
            # Re-queue evaluations that have been stale (no status update) for more than a day (86400 s).
timestamp = model["last_updated_timestamp"]
if utils.current_seconds_time() - timestamp > 86400:
utils.update_model_queue(
repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
model_name=model["model_name"],
commit_id=model["commit_id"],
inference_function=model["inference_function"],
status="queued",
)
print(f"Model {model['model_name']} is staled for more than a day.")
models_to_be_evaluated.append(model)
models_in_progress.remove(model)
        # Launch new evaluations only when no model is currently being evaluated.
        if not models_in_progress:
for row in models_to_be_evaluated:
                # Launch background_inference.py for this submission in a separate process.
subprocess.Popen(
[
"python",
"background_inference.py",
row["model_name"],
row["commit_id"],
row["inference_function"],
]
)
print(f"Started the evaluation of {row['model_name']}.")
with tab2:
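    # Submission form: a model id on the HF Hub, a revision, and one of the predefined inference methods.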
model_name = st.text_input("Enter a model's name on HF")
model_revision = st.text_input(
"Enter a model's revision on HF (commit id, or branch name)",
placeholder="main",
value="main",
)
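    # The selectable inference methods are the functions exposed by eval_utils.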
inference_functions_names = [
func_name for func_name, _ in getmembers(eval_utils, isfunction)
]
inference_function = st.selectbox(
"Inference Method",
inference_functions_names,
)
# TODO: Allow modifying the adhoc threshold values of the different inference methods
# Show the docstring of the inference functions
inference_functions_docstring = [
getattr(eval_utils, func).__doc__ for func in inference_functions_names
]
inference_functions_df = pd.DataFrame(
{
"Method": inference_functions_names,
"Description": inference_functions_docstring,
}
)
with st.expander("Check the inference methods' short descriptions"):
st.markdown(
inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
)
st.write(
"Note: We are happy to discuss adding new custom inference methods for your models."
)
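    # Once all fields are filled in, resolve the revision to a commit id and queue the submission if it is new.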
if model_name and model_revision and inference_function:
# Get the model's commit id
commit_id = api.list_repo_commits(model_name, revision=model_revision)[
0
].commit_id
model_predictions_rows = datasets.load_dataset(
os.environ["PREDICTIONS_DATASET_NAME"]
)["train"]
# Check if the model is already in the leaderboard
        model_exists = any(
            row["model_name"] == model_name
            and row["commit_id"] == commit_id
            and row["inference_function"] == inference_function
            for row in model_predictions_rows
        )
if not model_exists:
# Add the model to the evaluation queue
utils.update_model_queue(
repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
model_name=model_name,
commit_id=commit_id,
inference_function=inference_function,
status="queued",
)
st.info(
f"The evaluation of the model {model_name} is queued for processing."
)
else:
st.info(
f"The model {model_name} has already submitted to the leaderboard before."
)