Spaces:
Running
Running
alielfilali01
committed on
Commit
•
7206088
1
Parent(s):
f5920ad
Update app.py
Browse files — Filter out any entries that contain '_last_sync_timestamp' in results.json
app.py
CHANGED
@@ -11,48 +11,42 @@ DATASET_REPO_ID = f"{OWNER}/requests-dataset"
|
|
11 |
|
12 |
HEADER = """
|
13 |
<center>
|
|
|
14 |
<h1>AraGen Leaderboard: Generative Tasks Evaluation of Arabic LLMs</h1>
|
15 |
</center>
|
16 |
|
17 |
<br></br>
|
18 |
|
19 |
-
<p>This leaderboard
|
20 |
|
21 |
-
<p>For more details, please consider going through the technical blogpost <a href="https://huggingface.co/blog/
|
22 |
"""
|
23 |
|
24 |
ABOUT_SECTION = """
|
25 |
-
## About
|
26 |
|
27 |
The AraGen Leaderboard is designed to evaluate and compare the performance of Chat Arabic Large Language Models (LLMs) on a set of generative tasks. By leveraging the new **3C3H** evaluation measure, which evaluates the model's output across six dimensions —Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness— the leaderboard provides a comprehensive and holistic evaluation of a model's performance in generating human-like and ethically responsible content.
|
28 |
|
29 |
### Why Focus on Chat Models?
|
30 |
|
31 |
-
AraGen
|
32 |
|
33 |
### How to Submit Your Model?
|
34 |
|
35 |
-
|
36 |
|
37 |
### Contact
|
38 |
|
39 |
-
For any inquiries or assistance,
|
40 |
-
"""
|
41 |
-
|
42 |
-
CITATION_BUTTON_LABEL = """
|
43 |
-
Copy the following snippet to cite these results
|
44 |
"""
|
45 |
|
46 |
CITATION_BUTTON_TEXT = """
|
47 |
-
|
48 |
-
author = {El Filali, Ali and Sengupta, Neha and Abouelseoud, Arwa and Nakov, Preslav and Fourrier, Clémentine},
|
49 |
-
title = {Rethinking LLM Evaluation with 3C3H: AraGen Benchmark and Leaderboard},
|
50 |
-
year = {2024},
|
51 |
-
publisher = {Inception},
|
52 |
-
howpublished = "\\url{https://huggingface.co/spaces/inceptionai/AraGen-Leaderboard}"
|
53 |
-
}
|
54 |
"""
|
55 |
|
|
|
|
|
|
|
56 |
|
57 |
def load_results():
|
58 |
# Get the current directory of the script and construct the path to results.json
|
@@ -63,6 +57,16 @@ def load_results():
|
|
63 |
with open(results_file, 'r') as f:
|
64 |
data = json.load(f)
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
# Lists to collect data
|
67 |
data_3c3h = []
|
68 |
data_tasks = []
|
@@ -654,7 +658,7 @@ def main():
|
|
654 |
citation_button = gr.Textbox(
|
655 |
value=CITATION_BUTTON_TEXT,
|
656 |
label=CITATION_BUTTON_LABEL,
|
657 |
-
lines=
|
658 |
elem_id="citation-button",
|
659 |
show_copy_button=True,
|
660 |
)
|
|
|
11 |
|
12 |
HEADER = """
|
13 |
<center>
|
14 |
+
<h1>This space is experimental and should stay always private!</h1><br></br>
|
15 |
<h1>AraGen Leaderboard: Generative Tasks Evaluation of Arabic LLMs</h1>
|
16 |
</center>
|
17 |
|
18 |
<br></br>
|
19 |
|
20 |
+
<p>This leaderboard is designed to redefine the evaluation of Arabic Large Language Models (LLMs). Powered by the new <strong>3C3H</strong> evaluation measure, this framework delivers a transparent, robust, and holistic evaluation system that balances factual accuracy and usability assessment for a production ready setting.</p>
|
21 |
|
22 |
+
<p>For more details, please consider going through the technical blogpost <a href="https://huggingface.co/blog/">here</a>.</p>
|
23 |
"""
|
24 |
|
25 |
ABOUT_SECTION = """
|
26 |
+
## About:
|
27 |
|
28 |
The AraGen Leaderboard is designed to evaluate and compare the performance of Chat Arabic Large Language Models (LLMs) on a set of generative tasks. By leveraging the new **3C3H** evaluation measure, which evaluates the model's output across six dimensions —Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness— the leaderboard provides a comprehensive and holistic evaluation of a model's performance in generating human-like and ethically responsible content.
|
29 |
|
30 |
### Why Focus on Chat Models?
|
31 |
|
32 |
+
AraGen —And 3C3H in general— is specifically tailored to assess **chat models**, which interact in conversational settings, intended for end user interaction and require a blend of factual accuracy and user-centric dialogue capabilities. While it is technically possible to submit foundational models, we kindly ask users to refrain from doing so. For evaluations of foundational models using likelihood accuracy based benchmarks, please refer to the [Open Arabic LLM Leaderboard (OALL)](https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard).
|
33 |
|
34 |
### How to Submit Your Model?
|
35 |
|
36 |
+
Navigate to the submission section below to submit your open chat model from the HuggingFace Hub for evaluation. Ensure that your model is public and the submitted metadata (precision, revision, #params) is accurate.
|
37 |
|
38 |
### Contact
|
39 |
|
40 |
+
For any inquiries or assistance, please reach out through the community tab at [Inception AraGen Community](https://huggingface.co/spaces/inceptionai/AraGen-Leaderboard/discussions) or via [email](mailto:ali.filali@inceptionai.ai).
|
|
|
|
|
|
|
|
|
41 |
"""
|
42 |
|
43 |
CITATION_BUTTON_TEXT = """
|
44 |
+
CITATION
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
"""
|
46 |
|
47 |
+
CITATION_BUTTON_LABEL = """
|
48 |
+
CITATION_BUTTON_LABEL
|
49 |
+
"""
|
50 |
|
51 |
def load_results():
|
52 |
# Get the current directory of the script and construct the path to results.json
|
|
|
57 |
with open(results_file, 'r') as f:
|
58 |
data = json.load(f)
|
59 |
|
60 |
+
# Filter out any entries that only contain '_last_sync_timestamp'
|
61 |
+
filtered_data = []
|
62 |
+
for entry in data:
|
63 |
+
# If '_last_sync_timestamp' is the only key, skip it
|
64 |
+
if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
|
65 |
+
continue
|
66 |
+
filtered_data.append(entry)
|
67 |
+
|
68 |
+
data = filtered_data
|
69 |
+
|
70 |
# Lists to collect data
|
71 |
data_3c3h = []
|
72 |
data_tasks = []
|
|
|
658 |
citation_button = gr.Textbox(
|
659 |
value=CITATION_BUTTON_TEXT,
|
660 |
label=CITATION_BUTTON_LABEL,
|
661 |
+
lines=20,
|
662 |
elem_id="citation-button",
|
663 |
show_copy_button=True,
|
664 |
)
|