# MLADI / app.py
import os
import subprocess
import streamlit as st
import datasets
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from constants import DIALECTS_WITH_LABELS
from inspect import getmembers, isfunction
import eval_utils
import utils
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from huggingface_hub import HfApi
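# Hugging Face Hub client, used to resolve a submitted revision into a concrete commit id.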
api = HfApi()
st.set_page_config(layout="wide")
st.title("MLADI Leaderboard")
st.write(
"The Multi-label Arabic Dialect Identification (MLADI) leaderboard serves as a public interface for benchmarking ADI "
"models using an 'extended version' of the NADI 2024 test set, "
"the first multi-label country-level ADI dataset.\n\n"
"🔜 More information about the dataset extension will be coming soon, stay tuned!"
)
SHARED_TASK_TEAMS = {
"Elyadata": "https://aclanthology.org/2024.arabicnlp-1.85/",
"NLP_DI": "https://aclanthology.org/2024.arabicnlp-1.82/",
"dzNlp": "https://aclanthology.org/2024.arabicnlp-1.84/",
}
tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
with tab1:
# Load the labels
dataset_name = os.environ["DATASET_NAME"]
dataset = datasets.load_dataset(dataset_name)["test"]
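    # Per-dialect gold labels for the test samples (used as y_true below).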
labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
print("Loaded the labels, no. of samples:", len(dataset))
# Load the models' predictions
try:
model_predictions_rows = datasets.load_dataset(
os.environ["PREDICTIONS_DATASET_NAME"]
)["train"]
    except Exception:
        st.info("Error in loading the results!")
        model_predictions_rows = []
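    # Compute the leaderboard metrics only when the predictions dataset was loaded successfully.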
if model_predictions_rows:
# TODO: Store these metrics in a separate dataset!
evaluation_metrics = []
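        # Each row corresponds to one submission: its test-set predictions plus model/status metadata.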
for row in model_predictions_rows:
# Evaluate the models
accuracy_scores = {}
f1_scores = {}
recall_scores = {}
precision_scores = {}
predictions = row["predictions"]
            # Only completed submissions are scored.
            if row["status"] != "completed":
                continue
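            # Score each dialect as an independent binary classification task.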
for dialect in DIALECTS_WITH_LABELS:
y_true = labels[dialect]
y_pred = [dialect in prediction for prediction in predictions]
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
accuracy_scores[dialect] = accuracy
f1_scores[dialect] = f1
recall_scores[dialect] = recall
precision_scores[dialect] = precision
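            # Macro-average the per-dialect scores to get one value per metric.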
macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
macro_avg_f1 = np.mean(list(f1_scores.values()))
macro_avg_recall = np.mean(list(recall_scores.values()))
macro_avg_precision = np.mean(list(precision_scores.values()))
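            # Shared-task team entries link to their system description papers rather than a Hub model page,
            # and have no commit id.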
evaluation_metrics.append(
{
"Model Name": row["model_name"],
"Accuracy": macro_avg_accuracy,
"Recall": macro_avg_recall,
"Precision": macro_avg_precision,
"F1 score": macro_avg_f1,
"Inference Method": row["inference_function"],
"URL": f"https://huggingface.co/{row['model_name']}"
if ("shared task team" not in row["model_name"])
else SHARED_TASK_TEAMS[row["model_name"].split(" (")[0]],
"Commit ID": row["commit_id"][:5]
if ("shared task team" not in row["model_name"])
else "N/A",
}
)
if evaluation_metrics:
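            # Rank submissions by macro-averaged F1 score (descending).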
results_df = pd.DataFrame(evaluation_metrics).sort_values(
"F1 score", ascending=False
)
results_df["Rank"] = range(1, len(results_df) + 1)
results_df = results_df[
[
"Rank",
"Model Name",
"F1 score",
"Precision",
"Recall",
"Accuracy",
"Inference Method",
"URL",
"Commit ID",
]
]
st.data_editor(
results_df,
column_config={
"URL": st.column_config.LinkColumn("URL", required=False),
},
hide_index=True,
)
st.write("Note: The metrics are macro-averaged across all 11 dialects.")
with st.expander("Click for more information."):
inference_functions_names = [
func_name for func_name, _ in getmembers(eval_utils, isfunction)
]
# Show the docstring of the inference functions
inference_functions_docstring = [
getattr(eval_utils, func).__doc__ for func in inference_functions_names
]
inference_functions_df = pd.DataFrame(
{
"Method": inference_functions_names,
"Description": inference_functions_docstring,
}
)
st.markdown("## Inference Methods' Descriptions", unsafe_allow_html=True)
st.markdown(
inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
)
with open("leaderboard_info.md", "r") as f:
MARKDOWN_TEXT = f.read()
st.markdown(MARKDOWN_TEXT)
st.markdown("For any inquiries, please do not hesistate to contact me: https://amr-keleg.github.io/")
with st.expander("Cite this leaderboard!"):
st.write(
"""
Please cite the following paper in which we introduced the NADI 2024 evaluation sets:
```
@inproceedings{abdul-mageed-etal-2024-nadi,
title = "{NADI} 2024: The Fifth Nuanced {A}rabic Dialect Identification Shared Task",
author = "Abdul-Mageed, Muhammad and
Keleg, Amr and
Elmadany, AbdelRahim and
Zhang, Chiyu and
Hamed, Injy and
Magdy, Walid and
Bouamor, Houda and
Habash, Nizar",
editor = "Habash, Nizar and
Bouamor, Houda and
Eskander, Ramy and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Abdelali, Ahmed and
Touileb, Samia and
Hamed, Injy and
Onaizan, Yaser and
Alhafni, Bashar and
Antoun, Wissam and
Khalifa, Salam and
Haddad, Hatem and
Zitouni, Imed and
AlKhamissi, Badr and
Almatham, Rawan and
Mrini, Khalil",
booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.arabicnlp-1.79",
doi = "10.18653/v1/2024.arabicnlp-1.79",
pages = "709--728",
}
```
"""
)
# Evaluate the models queued
if model_predictions_rows:
models_to_be_evaluated = []
models_in_progress = []
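        # Split submissions by status: "queued" models await evaluation, "in_progress" ones are currently being evaluated.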
for row in model_predictions_rows:
if row["status"] == "queued":
models_to_be_evaluated.append(row)
elif row["status"] == "in_progress":
models_in_progress.append(row)
        # Iterate over a copy: stale models are removed from models_in_progress inside the loop.
        for model in list(models_in_progress):
            # Re-queue evaluations that have been stale (no status update) for more than a day (86400 s).
timestamp = model["last_updated_timestamp"]
if utils.current_seconds_time() - timestamp > 86400:
utils.update_model_queue(
repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
model_name=model["model_name"],
commit_id=model["commit_id"],
inference_function=model["inference_function"],
status="queued",
)
print(f"Model {model['model_name']} is staled for more than a day.")
models_to_be_evaluated.append(model)
models_in_progress.remove(model)
        # Launch new evaluations only when no model is currently being evaluated.
        if not models_in_progress:
for row in models_to_be_evaluated:
                # Launch background_inference.py for this submission in a separate process.
subprocess.Popen(
[
"python",
"background_inference.py",
row["model_name"],
row["commit_id"],
row["inference_function"],
]
)
print(f"Started the evaluation of {row['model_name']}.")
with tab2:
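    # Submission form: a model id on the HF Hub, a revision, and one of the predefined inference methods.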
model_name = st.text_input("Enter a model's name on HF")
model_revision = st.text_input(
"Enter a model's revision on HF (commit id, or branch name)",
placeholder="main",
value="main",
)
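    # The selectable inference methods are the functions exposed by eval_utils.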
inference_functions_names = [
func_name for func_name, _ in getmembers(eval_utils, isfunction)
]
inference_function = st.selectbox(
"Inference Method",
inference_functions_names,
)
# TODO: Allow modifying the adhoc threshold values of the different inference methods
# Show the docstring of the inference functions
inference_functions_docstring = [
getattr(eval_utils, func).__doc__ for func in inference_functions_names
]
inference_functions_df = pd.DataFrame(
{
"Method": inference_functions_names,
"Description": inference_functions_docstring,
}
)
with st.expander("Check the inference methods' short descriptions"):
st.markdown(
inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
)
st.write(
"Note: We are happy to discuss adding new custom inference methods for your models."
)
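    # Once all fields are filled in, resolve the revision to a commit id and queue the submission if it is new.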
if model_name and model_revision and inference_function:
# Get the model's commit id
commit_id = api.list_repo_commits(model_name, revision=model_revision)[
0
].commit_id
model_predictions_rows = datasets.load_dataset(
os.environ["PREDICTIONS_DATASET_NAME"]
)["train"]
# Check if the model is already in the leaderboard
        model_exists = any(
            row["model_name"] == model_name
            and row["commit_id"] == commit_id
            and row["inference_function"] == inference_function
            for row in model_predictions_rows
        )
if not model_exists:
# Add the model to the evaluation queue
utils.update_model_queue(
repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
model_name=model_name,
commit_id=commit_id,
inference_function=inference_function,
status="queued",
)
st.info(
f"The evaluation of the model {model_name} is queued for processing."
)
else:
st.info(
f"The model {model_name} has already submitted to the leaderboard before."
)