import json
import os
from datetime import datetime

import gradio as gr
import pandas as pd

from envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO

custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Vazirmatn&display=swap');
body, .gradio-container, .gr-button, .gr-input, .gr-slider, .gr-dropdown, .gr-markdown {
    font-family: 'Vazirmatn', sans-serif !important;
}

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#citation-button span {
    font-size: 16px !important;
}

#citation-button textarea {
    font-size: 16px !important;
}

#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
#leaderboard-table td:nth-child(2),
#leaderboard-table th:nth-child(2) {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}
#filter_type {
    border: 0;
    padding-left: 0;
    padding-top: 0;
}
#filter_type label {
    display: flex;
}
#filter_type label > span {
    margin-top: var(--spacing-lg);
    margin-right: 0.5em;
}
#filter_type label > .wrap {
    width: 103px;
}
#filter_type label > .wrap .wrap-inner {
    padding: 2px;
}
#filter_type label > .wrap .wrap-inner input {
    width: 1px;
}
#filter-columns-type {
    border: 0;
    /* "padding: 0.5" is unitless and therefore invalid CSS; assuming em was intended */
    padding: 0.5em;
}
#filter-columns-size {
    border: 0;
    padding: 0.5em;
}
#box-filter > .form {
    border: 0;
}
"""

LLM_BENCHMARKS_ABOUT_TEXT = """
# Persian LLM Leaderboard (v1.0.0)

> The Persian LLM Evaluation Leaderboard, developed by **Part DP AI** in collaboration with **AUT (Amirkabir University of Technology) NLP Lab**, provides a comprehensive benchmarking system specifically designed for Persian LLMs. This leaderboard, based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), offers a unique platform for evaluating the performance of large language models (LLMs) on tasks that demand linguistic proficiency and technical skill in Persian.

> **Note:** This leaderboard is continuously updating its data and models, reflecting the latest developments in Persian LLMs. It is currently in version 1.0.0, serving as the initial benchmark for Persian LLM evaluation, with plans for future enhancements.

## 1. Key Features

> 1. **Open Evaluation Access**  
>    The leaderboard is open to participation: developers and researchers can submit evaluation requests for their open-source models. This accessibility encourages the development and testing of Persian LLMs within the broader AI ecosystem.
> 
> 2. **Task Diversity**  
>    Six specialized tasks have been curated for this leaderboard, each tailored to challenge different aspects of a model’s capabilities. These tasks include:
>    - **Part Multiple Choice**
>    - **ARC Easy**
>    - **ARC Challenge**
>    - **MMLU Pro**
>    - **GSM8k Persian**
>    - **Multiple Choice Persian**
> 
>    Each dataset is available in Persian, providing a robust testing ground for models in a non-English setting. The datasets collectively contain over **40k samples** across various categories such as **Common Knowledge**, **Reasoning**, **Summarization**, **Math**, and **Specialized Examinations**, offering comprehensive coverage of diverse linguistic and technical challenges.
> 
> 3. **Open-Source Dataset Sample**  
>    A sample of the evaluation dataset is hosted on [Hugging Face Datasets](https://huggingface.co/datasets/PartAI/llm-leaderboard-datasets-sample), offering the AI community a glimpse of the benchmark content and format. This sample allows developers to pre-assess their models against representative data before a full leaderboard evaluation.
> 
> 4. **Collaborative Development**  
>    This leaderboard represents a significant collaboration between Part AI and Professor Saeedeh Momtazi of Amirkabir University of Technology, leveraging industrial expertise and academic research to create a high-quality, open benchmarking tool. The partnership underscores a shared commitment to advancing Persian LLMs.
> 
> 5. **Comprehensive Evaluation Pipeline**  
>    By integrating a standardized evaluation pipeline, models are assessed across a variety of data types, including text, mathematical formulas, and numerical data. This multi-faceted approach enhances the evaluation’s reliability and allows for precise, nuanced assessment of model performance across multiple dimensions.

## 2. Background and Goals

> Recent months have seen a notable increase in the development of Persian LLMs by research centers and AI companies in Iran. However, the lack of reliable, standardized benchmarks for Persian LLMs has made it challenging to evaluate model quality comprehensively. Global benchmarks typically do not support Persian, resulting in skewed or unreliable results for Persian LLMs.
> 
> This leaderboard addresses this gap by providing a locally-focused, transparent system that enables consistent, fair comparisons of Persian LLMs. It is expected to be a valuable tool for Persian-speaking businesses and developers, allowing them to select models best suited to their needs. Researchers and model developers also benefit from the competitive environment, with opportunities to showcase and improve their models based on benchmark rankings.

## 3. Data Privacy and Integrity

> To maintain evaluation integrity and prevent overfitting or data leakage, only part of the benchmark dataset is openly available. This limited-access approach preserves the reliability of the evaluation, ensuring that results genuinely reflect each model’s capabilities on unseen data.
> 
> The leaderboard represents a significant milestone in Persian LLM development and is positioned to become the leading standard for LLM evaluation in the Persian-speaking world.

"""


LLM_BENCHMARKS_SUBMIT_TEXT = """## Submitting a Model for Evaluation

> To submit your open-source model for evaluation, follow these steps:
> 
> 1. **Ensure your model is on Hugging Face**: Your model must be publicly available on [Hugging Face](https://huggingface.co/).
> 
> 2. **Submit Request**: Send a request with your model's Hugging Face identifier (e.g., `username/model-name`).
> 
> 3. **Manual Queue**: Please note that the evaluation process is currently handled manually. Submissions will be queued and processed as soon as possible.
> 
> 4. **Results**: Once the evaluation is complete, your model’s results will be updated on the leaderboard.
> 
> We appreciate your patience and contributions to the Persian LLM ecosystem!
"""


PART_LOGO = """
<img src="https://avatars.githubusercontent.com/u/39557177?v=4" style="width:30%;display:block;margin-left:auto;margin-right:auto"> 
<h1 style="font-size: 28px; margin-bottom: 2px;">Part DP AI</h1>
"""


def load_jsonl(input_file):
    """Read a JSONL file (one JSON object per line) into a list of dicts."""
    data = []
    # Explicit UTF-8 so Persian content decodes correctly on any platform.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data


def jsonl_to_dataframe(input_file):
    """Load a JSONL results file into a pandas DataFrame, one row per record."""
    data = load_jsonl(input_file)
    return pd.DataFrame(data)


def sort_dataframe_by_column(df, column_name):
    """Sort df by column_name in descending order (best score first)."""
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
    return df.sort_values(by=column_name, ascending=False).reset_index(drop=True)


def add_average_column_to_df(df, columns_to_average, index=3, average_column_name="Average Accuracy"):
    """Insert the row-wise mean of columns_to_average as a new column at position index."""
    average_column = df[columns_to_average].mean(axis=1)
    df.insert(index, average_column_name, average_column)
    return df
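

# A sketch of how these helpers compose into a leaderboard table. The file and
# column names below are assumptions for illustration, not the leaderboard's
# actual schema, and the default index=3 assumes at least three leading
# metadata columns before the task scores.
def build_leaderboard_df(results_file, task_columns):
    """Load raw results, insert their average as a column, and rank by it."""
    df = jsonl_to_dataframe(results_file)
    df = add_average_column_to_df(df, task_columns)
    return sort_dataframe_by_column(df, "Average Accuracy")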


def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)
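

# Quick illustration of the markup this produces, using a hypothetical model
# id; only the href and link text vary between models.
assert make_clickable_model("org/model").startswith(
    '<a target="_blank" href="https://huggingface.co/org/model"'
)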


def center_align_markdown(text):
    return f'<div align="center">{text}</div>'


def apply_markdown_format_for_columns(df, model_column_name):
    """Render the model column as clickable Hugging Face links."""
    df[model_column_name] = df[model_column_name].apply(make_clickable_model)
    # Center-aligning the remaining columns is currently disabled:
    # for column in df.columns:
    #     if column != model_column_name:
    #         df[column] = df[column].apply(center_align_markdown)
    return df


def submit(model_name, model_id, contact_email, license):
    """Write an eval request to disk and upload it to the evaluation queue repo."""
    if model_name == "" or model_id == "" or license == "" or contact_email == "":
        gr.Info("Please fill all the fields")
        return

    try:
        user_name = ""
        model_path = model_id  # fallback so model_path is defined for ids without a "user/" prefix
        if "/" in model_id:
            user_name = model_id.split("/")[0]
            model_path = model_id.split("/")[1]
        eval_entry = {
            "model_name": model_name,
            "model_id": model_id,
            "contact_email": contact_email,
            "license": license
        }

        # Get the current timestamp to add to the filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
        os.makedirs(OUT_DIR, exist_ok=True)

        # Add the timestamp to the filename
        out_path = f"{OUT_DIR}/{user_name}_{model_path}_{timestamp}.json"

        with open(out_path, "w") as f:
            f.write(json.dumps(eval_entry))

        print("Uploading eval file")
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to eval queue",
        )

        gr.Info("Successfully submitted", duration=10)
        # Remove the local file
        os.remove(out_path)
    except Exception as e:
        # gr.Error is an exception; it must be raised for Gradio to display it.
        raise gr.Error(f"Error submitting the model: {e}")