Spaces:
Running
Running
alielfilali01
committed on
Commit
•
7206088
1
Parent(s):
f5920ad
Update app.py
Browse files — Filter out any entries that contain '_last_sync_timestamp' in results.json
app.py
CHANGED
@@ -11,48 +11,42 @@ DATASET_REPO_ID = f"{OWNER}/requests-dataset"
|
|
11 |
|
12 |
HEADER = """
|
13 |
<center>
|
|
|
14 |
<h1>AraGen Leaderboard: Generative Tasks Evaluation of Arabic LLMs</h1>
|
15 |
</center>
|
16 |
|
17 |
<br></br>
|
18 |
|
19 |
-
<p>This leaderboard
|
20 |
|
21 |
-
<p>For more details, please consider going through the technical blogpost <a href="https://huggingface.co/blog/
|
22 |
"""
|
23 |
|
24 |
ABOUT_SECTION = """
|
25 |
-
## About
|
26 |
|
27 |
The AraGen Leaderboard is designed to evaluate and compare the performance of Chat Arabic Large Language Models (LLMs) on a set of generative tasks. By leveraging the new **3C3H** evaluation measure, which evaluates the model's output across six dimensions —Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness— the leaderboard provides a comprehensive and holistic evaluation of a model's performance in generating human-like and ethically responsible content.
|
28 |
|
29 |
### Why Focus on Chat Models?
|
30 |
|
31 |
-
AraGen
|
32 |
|
33 |
### How to Submit Your Model?
|
34 |
|
35 |
-
|
36 |
|
37 |
### Contact
|
38 |
|
39 |
-
For any inquiries or assistance,
|
40 |
-
"""
|
41 |
-
|
42 |
-
CITATION_BUTTON_LABEL = """
|
43 |
-
Copy the following snippet to cite these results
|
44 |
"""
|
45 |
|
46 |
CITATION_BUTTON_TEXT = """
|
47 |
-
|
48 |
-
author = {El Filali, Ali and Sengupta, Neha and Abouelseoud, Arwa and Nakov, Preslav and Fourrier, Clémentine},
|
49 |
-
title = {Rethinking LLM Evaluation with 3C3H: AraGen Benchmark and Leaderboard},
|
50 |
-
year = {2024},
|
51 |
-
publisher = {Inception},
|
52 |
-
howpublished = "\\url{https://huggingface.co/spaces/inceptionai/AraGen-Leaderboard}"
|
53 |
-
}
|
54 |
"""
|
55 |
|
|
|
|
|
|
|
56 |
|
57 |
def load_results():
|
58 |
# Get the current directory of the script and construct the path to results.json
|
@@ -63,6 +57,16 @@ def load_results():
|
|
63 |
with open(results_file, 'r') as f:
|
64 |
data = json.load(f)
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
# Lists to collect data
|
67 |
data_3c3h = []
|
68 |
data_tasks = []
|
@@ -654,7 +658,7 @@ def main():
|
|
654 |
citation_button = gr.Textbox(
|
655 |
value=CITATION_BUTTON_TEXT,
|
656 |
label=CITATION_BUTTON_LABEL,
|
657 |
-
lines=
|
658 |
elem_id="citation-button",
|
659 |
show_copy_button=True,
|
660 |
)
|
|
|
11 |
|
12 |
HEADER = """
|
13 |
<center>
|
14 |
+
<h1>This space is experimental and should stay always private!</h1><br></br>
|
15 |
<h1>AraGen Leaderboard: Generative Tasks Evaluation of Arabic LLMs</h1>
|
16 |
</center>
|
17 |
|
18 |
<br></br>
|
19 |
|
20 |
+
<p>This leaderboard is designed to redefine the evaluation of Arabic Large Language Models (LLMs). Powered by the new <strong>3C3H</strong> evaluation measure, this framework delivers a transparent, robust, and holistic evaluation system that balances factual accuracy and usability assessment for a production ready setting.</p>
|
21 |
|
22 |
+
<p>For more details, please consider going through the technical blogpost <a href="https://huggingface.co/blog/">here</a>.</p>
|
23 |
"""
|
24 |
|
25 |
ABOUT_SECTION = """
|
26 |
+
## About:
|
27 |
|
28 |
The AraGen Leaderboard is designed to evaluate and compare the performance of Chat Arabic Large Language Models (LLMs) on a set of generative tasks. By leveraging the new **3C3H** evaluation measure, which evaluates the model's output across six dimensions —Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness— the leaderboard provides a comprehensive and holistic evaluation of a model's performance in generating human-like and ethically responsible content.
|
29 |
|
30 |
### Why Focus on Chat Models?
|
31 |
|
32 |
+
AraGen —And 3C3H in general— is specifically tailored to assess **chat models**, which interact in conversational settings, intended for end user interaction and require a blend of factual accuracy and user-centric dialogue capabilities. While it is technically possible to submit foundational models, we kindly ask users to refrain from doing so. For evaluations of foundational models using likelihood accuracy based benchmarks, please refer to the [Open Arabic LLM Leaderboard (OALL)](https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard).
|
33 |
|
34 |
### How to Submit Your Model?
|
35 |
|
36 |
+
Navigate to the submission section below to submit your open chat model from the HuggingFace Hub for evaluation. Ensure that your model is public and the submitted metadata (precision, revision, #params) is accurate.
|
37 |
|
38 |
### Contact
|
39 |
|
40 |
+
For any inquiries or assistance, please reach out through the community tab at [Inception AraGen Community](https://huggingface.co/spaces/inceptionai/AraGen-Leaderboard/discussions) or via [email](mailto:ali.filali@inceptionai.ai).
|
|
|
|
|
|
|
|
|
41 |
"""
|
42 |
|
43 |
CITATION_BUTTON_TEXT = """
|
44 |
+
CITATION
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
"""
|
46 |
|
47 |
+
CITATION_BUTTON_LABEL = """
|
48 |
+
CITATION_BUTTON_LABEL
|
49 |
+
"""
|
50 |
|
51 |
def load_results():
|
52 |
# Get the current directory of the script and construct the path to results.json
|
|
|
57 |
with open(results_file, 'r') as f:
|
58 |
data = json.load(f)
|
59 |
|
60 |
+
# Filter out any entries that only contain '_last_sync_timestamp'
|
61 |
+
filtered_data = []
|
62 |
+
for entry in data:
|
63 |
+
# If '_last_sync_timestamp' is the only key, skip it
|
64 |
+
if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
|
65 |
+
continue
|
66 |
+
filtered_data.append(entry)
|
67 |
+
|
68 |
+
data = filtered_data
|
69 |
+
|
70 |
# Lists to collect data
|
71 |
data_3c3h = []
|
72 |
data_tasks = []
|
|
|
658 |
citation_button = gr.Textbox(
|
659 |
value=CITATION_BUTTON_TEXT,
|
660 |
label=CITATION_BUTTON_LABEL,
|
661 |
+
lines=20,
|
662 |
elem_id="citation-button",
|
663 |
show_copy_button=True,
|
664 |
)
|