import os
import subprocess
import streamlit as st

import datasets

from constants import DIALECTS_WITH_LABELS
from inspect import getmembers, isfunction
import eval_utils
import utils
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from huggingface_hub import HfApi

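# Client for the Hugging Face Hub API, used below to resolve submitted model
# revisions (branch names or commit ids) to concrete commit ids.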
api = HfApi()

st.set_page_config(layout="wide")
st.title("MLADI Leaderboard")
st.write(
    "The Multi-label Arabic Dialect Identification (MLADI) leaderboard serves as a public interface for benchmarking ADI "
    "models using an 'extended version' of the NADI 2024 test set, "
    "the first multi-label country-level ADI dataset.\n\n"
    "🔜 More information about the dataset extension will be coming soon, stay tuned!"
)

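# Shared-task teams are listed on the leaderboard as well; their entries link to
# the system-description papers below instead of a Hugging Face model repo.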
SHARED_TASK_TEAMS = {
    "Elyadata": "https://aclanthology.org/2024.arabicnlp-1.85/",
    "NLP_DI": "https://aclanthology.org/2024.arabicnlp-1.82/",
    "dzNlp": "https://aclanthology.org/2024.arabicnlp-1.84/",
}
tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
with tab1:
    # Load the labels
    dataset_name = os.environ["DATASET_NAME"]
    dataset = datasets.load_dataset(dataset_name)["test"]
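    # Build a dialect -> per-sample boolean column mapping; each column is later
    # used as y_true when scoring that dialect.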
    labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}

    print("Loaded the labels, no. of samples:", len(dataset))

    # Load the models' predictions
    try:
        model_predictions_rows = datasets.load_dataset(
            os.environ["PREDICTIONS_DATASET_NAME"]
        )["train"]

    except Exception as e:
        print(f"Failed to load the predictions dataset: {e}")
        st.info("Error in loading the results!")
        model_predictions_rows = []

    if model_predictions_rows:
        # TODO: Store these metrics in a separate dataset!
        evaluation_metrics = []
        for row in model_predictions_rows:
            # Evaluate the models
            accuracy_scores = {}
            f1_scores = {}
            recall_scores = {}
            precision_scores = {}
            predictions = row["predictions"]

            # Only completed evaluations are shown on the leaderboard.
            if row["status"] != "completed":
                continue

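            # Score each dialect as an independent binary classification task:
            # y_pred marks whether the model included the dialect in its prediction.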
            for dialect in DIALECTS_WITH_LABELS:
                y_true = labels[dialect]
                y_pred = [dialect in prediction for prediction in predictions]
                accuracy = accuracy_score(y_true, y_pred)
                f1 = f1_score(y_true, y_pred)
                recall = recall_score(y_true, y_pred)
                precision = precision_score(y_true, y_pred)

                accuracy_scores[dialect] = accuracy
                f1_scores[dialect] = f1
                recall_scores[dialect] = recall
                precision_scores[dialect] = precision

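            # Macro-average across dialects so that every dialect contributes
            # equally, regardless of how many positive samples it has.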
            macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
            macro_avg_f1 = np.mean(list(f1_scores.values()))
            macro_avg_recall = np.mean(list(recall_scores.values()))
            macro_avg_precision = np.mean(list(precision_scores.values()))

            evaluation_metrics.append(
                {
                    "Model Name": row["model_name"],
                    "Accuracy": macro_avg_accuracy,
                    "Recall": macro_avg_recall,
                    "Precision": macro_avg_precision,
                    "F1 score": macro_avg_f1,
                    "Inference Method": row["inference_function"],
                    "URL": f"https://huggingface.co/{row['model_name']}"
                    if ("shared task team" not in row["model_name"])
                    else SHARED_TASK_TEAMS[row["model_name"].split(" (")[0]],
                    "Commit ID": row["commit_id"][:5]
                    if ("shared task team" not in row["model_name"])
                    else "N/A",
                }
            )

        if evaluation_metrics:
            results_df = pd.DataFrame(evaluation_metrics).sort_values(
                "F1 score", ascending=False
            )
            results_df["Rank"] = range(1, len(results_df) + 1)

            results_df = results_df[
                [
                    "Rank",
                    "Model Name",
                    "F1 score",
                    "Precision",
                    "Recall",
                    "Accuracy",
                    "Inference Method",
                    "URL",
                    "Commit ID",
                ]
            ]
            st.data_editor(
                results_df,
                column_config={
                    "URL": st.column_config.LinkColumn("URL", required=False),
                },
                hide_index=True,
            )
            st.write("Note: The metrics are macro-averaged across all 11 dialects.")

        with st.expander("Click for more information."):
            inference_functions_names = [
                func_name for func_name, _ in getmembers(eval_utils, isfunction)
            ]
            # Show the docstring of the inference functions
            inference_functions_docstring = [
                getattr(eval_utils, func).__doc__ for func in inference_functions_names
            ]

            inference_functions_df = pd.DataFrame(
                {
                    "Method": inference_functions_names,
                    "Description": inference_functions_docstring,
                }
            )
            st.markdown("## Inference Methods' Descriptions", unsafe_allow_html=True)
            st.markdown(
                inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
            )

            with open("leaderboard_info.md", "r") as f:
                MARKDOWN_TEXT = f.read()
            st.markdown(MARKDOWN_TEXT)
            st.markdown("For any inquiries, please do not hesitate to contact me: https://amr-keleg.github.io/")

        with st.expander("Cite this leaderboard!"):
            st.write(
                """
                Please cite the following paper in which we introduced the NADI 2024 evaluation sets:
                ```
                @inproceedings{abdul-mageed-etal-2024-nadi,
                    title = "{NADI} 2024: The Fifth Nuanced {A}rabic Dialect Identification Shared Task",
                    author = "Abdul-Mageed, Muhammad  and
                    Keleg, Amr  and
                    Elmadany, AbdelRahim  and
                    Zhang, Chiyu  and
                    Hamed, Injy  and
                    Magdy, Walid  and
                    Bouamor, Houda  and
                    Habash, Nizar",
                    editor = "Habash, Nizar  and
                    Bouamor, Houda  and
                    Eskander, Ramy  and
                    Tomeh, Nadi  and
                    Abu Farha, Ibrahim  and
                    Abdelali, Ahmed  and
                    Touileb, Samia  and
                    Hamed, Injy  and
                    Onaizan, Yaser  and
                    Alhafni, Bashar  and
                    Antoun, Wissam  and
                    Khalifa, Salam  and
                    Haddad, Hatem  and
                    Zitouni, Imed  and
                    AlKhamissi, Badr  and
                    Almatham, Rawan  and
                    Mrini, Khalil",
                    booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference",
                    month = aug,
                    year = "2024",
                    address = "Bangkok, Thailand",
                    publisher = "Association for Computational Linguistics",
                    url = "https://aclanthology.org/2024.arabicnlp-1.79",
                    doi = "10.18653/v1/2024.arabicnlp-1.79",
                    pages = "709--728",
                }
                ```
                """
            )

    # Evaluate the models queued
    if model_predictions_rows:
        models_to_be_evaluated = []
        models_in_progress = []
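        # Each row carries a status: "queued" (awaiting evaluation), "in_progress",
        # or "completed" (already shown on the leaderboard above).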

        for row in model_predictions_rows:
            if row["status"] == "queued":
                models_to_be_evaluated.append(row)
            elif row["status"] == "in_progress":
                models_in_progress.append(row)

        # Re-queue evaluations that have been stalled for more than a day (86,400 seconds).
        # Iterate over a copy, since stalled models are removed from the list below.
        for model in list(models_in_progress):
            timestamp = model["last_updated_timestamp"]
            if utils.current_seconds_time() - timestamp > 86400:
                utils.update_model_queue(
                    repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
                    model_name=model["model_name"],
                    commit_id=model["commit_id"],
                    inference_function=model["inference_function"],
                    status="queued",
                )
                print(
                    f"The evaluation of {model['model_name']} has been stalled for "
                    "more than a day. Re-queuing it."
                )
                models_to_be_evaluated.append(model)
                models_in_progress.remove(model)

        if not models_in_progress:
            for row in models_to_be_evaluated:
                # Evaluate the model
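                # Launch the evaluation as a non-blocking background process so the
                # Streamlit app stays responsive while inference runs.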
                subprocess.Popen(
                    [
                        "python",
                        "background_inference.py",
                        row["model_name"],
                        row["commit_id"],
                        row["inference_function"],
                    ]
                )
                print(f"Started the evaluation of {row['model_name']}.")

with tab2:
    model_name = st.text_input("Enter a model's name on HF")
    model_revision = st.text_input(
        "Enter a model's revision on HF (commit id, or branch name)",
        placeholder="main",
        value="main",
    )
    inference_functions_names = [
        func_name for func_name, _ in getmembers(eval_utils, isfunction)
    ]
    inference_function = st.selectbox(
        "Inference Method",
        inference_functions_names,
    )

    # TODO: Allow modifying the adhoc threshold values of the different inference methods

    # Show the docstring of the inference functions
    inference_functions_docstring = [
        getattr(eval_utils, func).__doc__ for func in inference_functions_names
    ]

    inference_functions_df = pd.DataFrame(
        {
            "Method": inference_functions_names,
            "Description": inference_functions_docstring,
        }
    )
    with st.expander("Check the inference methods' short descriptions"):
        st.markdown(
            inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
        )
        st.write(
            "Note: We are happy to discuss adding new custom inference methods for your models."
        )

    if model_name and model_revision and inference_function:
        # Get the model's commit id
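        # Index 0 of the commit history resolves a branch name (e.g. "main") to the
        # commit it currently points to, pinning the submission to an exact snapshot.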
        commit_id = api.list_repo_commits(model_name, revision=model_revision)[
            0
        ].commit_id

        model_predictions_rows = datasets.load_dataset(
            os.environ["PREDICTIONS_DATASET_NAME"]
        )["train"]

        # Check if the model is already in the leaderboard
        model_exists = any(
            row["model_name"] == model_name
            and row["commit_id"] == commit_id
            and row["inference_function"] == inference_function
            for row in model_predictions_rows
        )

        if not model_exists:
            # Add the model to the evaluation queue
            utils.update_model_queue(
                repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
                model_name=model_name,
                commit_id=commit_id,
                inference_function=inference_function,
                status="queued",
            )
            st.info(
                f"The evaluation of the model {model_name} is queued for processing."
            )

        else:
            st.info(
                f"The model {model_name} has already been submitted to the leaderboard."
            )