import os
import subprocess
import streamlit as st

import datasets
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from constants import DIALECTS_WITH_LABELS
from inspect import getmembers, isfunction
import eval_utils
import utils
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from huggingface_hub import HfApi

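# Hugging Face Hub client, used below to resolve a submitted model's revision to a concrete commit id.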
api = HfApi()

st.set_page_config(layout="wide")
st.title("MLADI Leaderboard")
st.write(
    "The Multi-label Arabic Dialect Identification (MLADI) leaderboard serves as a public interface for benchmarking ADI "
    "models using an 'extended version' of the NADI 2024 test set, "
    "the first multi-label country-level ADI dataset.\n\n"
    "🔜 More information about the dataset extension will be coming soon, stay tuned!"
)

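# System description papers of the NADI 2024 shared-task teams; leaderboard rows for these
# teams link to their papers instead of a model page on the Hub.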
SHARED_TASK_TEAMS = {
    "Elyadata": "https://aclanthology.org/2024.arabicnlp-1.85/",
    "NLP_DI": "https://aclanthology.org/2024.arabicnlp-1.82/",
    "dzNlp": "https://aclanthology.org/2024.arabicnlp-1.84/",
}
tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
with tab1:

    dataset_name = os.environ["DATASET_NAME"]
    dataset = datasets.load_dataset(dataset_name)["test"]
    labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}

    print("Loaded the labels, no. of samples:", len(dataset))

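    # Each row of the predictions dataset describes one submission: model name, commit id,
    # inference method, evaluation status, and (once completed) the model's predictions.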
    try:
        model_predictions_rows = datasets.load_dataset(
            os.environ["PREDICTIONS_DATASET_NAME"]
        )["train"]

    except Exception:
        st.info("Error in loading the results!")
        model_predictions_rows = []

    if model_predictions_rows:

        evaluation_metrics = []
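        # Score each completed submission. Every dialect is treated as a binary task
        # (is the dialect among the sample's predicted labels?), and the per-dialect
        # metrics are then macro-averaged.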
        for row in model_predictions_rows:

            # Skip submissions that have not finished evaluation yet.
            if row["status"] != "completed":
                continue

            predictions = row["predictions"]
            accuracy_scores = {}
            f1_scores = {}
            recall_scores = {}
            precision_scores = {}

            for dialect in DIALECTS_WITH_LABELS:
                y_true = labels[dialect]
                y_pred = [dialect in prediction for prediction in predictions]
                accuracy = accuracy_score(y_true, y_pred)
                f1 = f1_score(y_true, y_pred)
                recall = recall_score(y_true, y_pred)
                precision = precision_score(y_true, y_pred)

                accuracy_scores[dialect] = accuracy
                f1_scores[dialect] = f1
                recall_scores[dialect] = recall
                precision_scores[dialect] = precision

            macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
            macro_avg_f1 = np.mean(list(f1_scores.values()))
            macro_avg_recall = np.mean(list(recall_scores.values()))
            macro_avg_precision = np.mean(list(precision_scores.values()))

            evaluation_metrics.append(
                {
                    "Model Name": row["model_name"],
                    "Accuracy": macro_avg_accuracy,
                    "Recall": macro_avg_recall,
                    "Precision": macro_avg_precision,
                    "F1 score": macro_avg_f1,
                    "Inference Method": row["inference_function"],
                    "URL": f"https://huggingface.co/{row['model_name']}"
                    if ("shared task team" not in row["model_name"])
                    else SHARED_TASK_TEAMS[row["model_name"].split(" (")[0]],
                    "Commit ID": row["commit_id"][:5]
                    if ("shared task team" not in row["model_name"])
                    else "N/A",
                }
            )

        if evaluation_metrics:
            results_df = pd.DataFrame(evaluation_metrics).sort_values(
                "F1 score", ascending=False
            )
            results_df["Rank"] = range(1, len(results_df) + 1)

            results_df = results_df[
                [
                    "Rank",
                    "Model Name",
                    "F1 score",
                    "Precision",
                    "Recall",
                    "Accuracy",
                    "Inference Method",
                    "URL",
                    "Commit ID",
                ]
            ]
            st.data_editor(
                results_df,
                column_config={
                    "URL": st.column_config.LinkColumn("URL", required=False),
                },
                hide_index=True,
            )
            st.write("Note: The metrics are macro-averaged across all 11 dialects.")

with st.expander("Click for more information."): |
|
inference_functions_names = [ |
|
func_name for func_name, _ in getmembers(eval_utils, isfunction) |
|
] |
|
|
|
inference_functions_docstring = [ |
|
getattr(eval_utils, func).__doc__ for func in inference_functions_names |
|
] |
|
|
|
inference_functions_df = pd.DataFrame( |
|
{ |
|
"Method": inference_functions_names, |
|
"Description": inference_functions_docstring, |
|
} |
|
) |
|
st.markdown("## Inference Methods' Descriptions", unsafe_allow_html=True) |
|
st.markdown( |
|
inference_functions_df.to_markdown(index=False), unsafe_allow_html=True |
|
) |
|
|
|
with open("leaderboard_info.md", "r") as f: |
|
MARKDOWN_TEXT = f.read() |
|
st.markdown(MARKDOWN_TEXT) |
|
st.markdown("For any inquiries, please do not hesistate to contact me: https://amr-keleg.github.io/") |
|
|
|
with st.expander("Cite this leaderboard!"): |
|
st.write( |
|
""" |
|
Please cite the following paper in which we introduced the NADI 2024 evaluation sets: |
|
``` |
|
@inproceedings{abdul-mageed-etal-2024-nadi, |
|
title = "{NADI} 2024: The Fifth Nuanced {A}rabic Dialect Identification Shared Task", |
|
author = "Abdul-Mageed, Muhammad and |
|
Keleg, Amr and |
|
Elmadany, AbdelRahim and |
|
Zhang, Chiyu and |
|
Hamed, Injy and |
|
Magdy, Walid and |
|
Bouamor, Houda and |
|
Habash, Nizar", |
|
editor = "Habash, Nizar and |
|
Bouamor, Houda and |
|
Eskander, Ramy and |
|
Tomeh, Nadi and |
|
Abu Farha, Ibrahim and |
|
Abdelali, Ahmed and |
|
Touileb, Samia and |
|
Hamed, Injy and |
|
Onaizan, Yaser and |
|
Alhafni, Bashar and |
|
Antoun, Wissam and |
|
Khalifa, Salam and |
|
Haddad, Hatem and |
|
Zitouni, Imed and |
|
AlKhamissi, Badr and |
|
Almatham, Rawan and |
|
Mrini, Khalil", |
|
booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference", |
|
month = aug, |
|
year = "2024", |
|
address = "Bangkok, Thailand", |
|
publisher = "Association for Computational Linguistics", |
|
url = "https://aclanthology.org/2024.arabicnlp-1.79", |
|
doi = "10.18653/v1/2024.arabicnlp-1.79", |
|
pages = "709--728", |
|
} |
|
``` |
|
""" |
|
) |
|
|
|
|
|
    if model_predictions_rows:
        models_to_be_evaluated = []
        models_in_progress = []

        for row in model_predictions_rows:
            if row["status"] == "queued":
                models_to_be_evaluated.append(row)
            elif row["status"] == "in_progress":
                models_in_progress.append(row)

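        # Requeue submissions that have been "in_progress" for more than a day (86400 seconds);
        # their evaluation process has most likely died.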
        # Iterate over a copy, since stalled models are removed from the list while looping.
        for model in list(models_in_progress):
            timestamp = model["last_updated_timestamp"]
            if utils.current_seconds_time() - timestamp > 86400:
                utils.update_model_queue(
                    repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
                    model_name=model["model_name"],
                    commit_id=model["commit_id"],
                    inference_function=model["inference_function"],
                    status="queued",
                )
                print(f"Model {model['model_name']} has been stale for more than a day.")
                models_to_be_evaluated.append(model)
                models_in_progress.remove(model)

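        # Only launch new evaluations when nothing is currently being evaluated;
        # each queued submission is handled by a separate background process.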
        if not models_in_progress:
            for row in models_to_be_evaluated:
                subprocess.Popen(
                    [
                        "python",
                        "background_inference.py",
                        row["model_name"],
                        row["commit_id"],
                        row["inference_function"],
                    ]
                )
                print(f"Started the evaluation of {row['model_name']}.")

with tab2:
    model_name = st.text_input("Enter a model's name on HF")
    model_revision = st.text_input(
        "Enter a model's revision on HF (commit id, or branch name)",
        placeholder="main",
        value="main",
    )
    inference_functions_names = [
        func_name for func_name, _ in getmembers(eval_utils, isfunction)
    ]
    inference_function = st.selectbox(
        "Inference Method",
        inference_functions_names,
    )

    inference_functions_docstring = [
        getattr(eval_utils, func).__doc__ for func in inference_functions_names
    ]

    inference_functions_df = pd.DataFrame(
        {
            "Method": inference_functions_names,
            "Description": inference_functions_docstring,
        }
    )
    with st.expander("Check the inference methods' short descriptions"):
        st.markdown(
            inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
        )
    st.write(
        "Note: We are happy to discuss adding new custom inference methods for your models."
    )

    if model_name and model_revision and inference_function:

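        # Pin the submission to the latest commit id of the requested revision,
        # so the evaluated snapshot of the model is unambiguous.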
        commit_id = api.list_repo_commits(model_name, revision=model_revision)[
            0
        ].commit_id

        model_predictions_rows = datasets.load_dataset(
            os.environ["PREDICTIONS_DATASET_NAME"]
        )["train"]

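        # Each (model name, commit id, inference method) combination is only queued once.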
        model_exists = any(
            row["model_name"] == model_name
            and row["commit_id"] == commit_id
            and row["inference_function"] == inference_function
            for row in model_predictions_rows
        )

        if not model_exists:
            utils.update_model_queue(
                repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
                model_name=model_name,
                commit_id=commit_id,
                inference_function=inference_function,
                status="queued",
            )
            st.info(
                f"The evaluation of the model {model_name} is queued for processing."
            )

        else:
            st.info(
                f"The model {model_name} has already been submitted to the leaderboard before."
            )
|