Clémentine committed
Commit 4ccfada (1 parent: e4ab31c)

fix display

Files changed:
- README.md +1 -1
- app.py +1 -1
- src/display/about.py +10 -9
- src/leaderboard/read_evals.py +3 -0
README.md
CHANGED
@@ -17,7 +17,7 @@ space_ci:
   - H4_TOKEN
 tags:
 - leaderboard
-short_description:
+short_description: Track, rank and evaluate open LLMs and chatbots
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -340,7 +340,7 @@ with demo:
         with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
             gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit
+        with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
            with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
src/display/about.py
CHANGED
@@ -1,16 +1,8 @@
 from src.display.utils import ModelType
 
-TITLE = """<h1
+TITLE = """<h1 style="text-align:left;float:left; id="space-title">🤗 Open LLM Leaderboard</h1> Track, rank and evaluate open LLMs and chatbots"""
 
 INTRODUCTION_TEXT = """
-📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
-
-🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
-The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
-
-Other cool leaderboards:
-- [LLM safety](https://huggingface.co/spaces/AI-Secure/llm-trustworthy-leaderboard)
-- [LLM performance](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
 """
 
 icons = f"""
@@ -24,6 +16,9 @@ LLM_BENCHMARKS_TEXT = f"""
 ## ABOUT
 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
 
+🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
+The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details below!
+
 ### Tasks
 📈 We evaluate models on 6 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
@@ -88,6 +83,12 @@ To get more information about quantization, see:
 ### Useful links
 - [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
 - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
+
+### Other cool leaderboards:
+- [LLM safety](https://huggingface.co/spaces/AI-Secure/llm-trustworthy-leaderboard)
+- [LLM performance](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
+
+
 """
 
 FAQ_TEXT = """
src/leaderboard/read_evals.py
CHANGED
@@ -204,6 +204,9 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str
         eval_result.update_with_request_file(requests_path)
         if eval_result.full_model in dynamic_data:
             eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
+        # Hardcoding because of gating problem
+        if "meta-llama" in eval_result.full_model:
+            eval_result.still_on_hub = True
 
         # Store results of same eval together
         eval_name = eval_result.eval_name