Update space

Files changed:
- app.py (+76, -8)
- src/about.py (+18, -1)

app.py (CHANGED)
@@ -11,6 +11,8 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    SUB_TITLE,
+    EXTERNAL_LINKS,
     COMING_SOON_TEXT
 )
 from src.display.css_html_js import custom_css

@@ -99,7 +101,8 @@ def init_leaderboard(dataframe):
 
 # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
 # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
-model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
+# model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
+model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
 # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 

@@ -124,6 +127,8 @@ def overall_leaderboard(dataframe):
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
+    gr.HTML(SUB_TITLE)
+    gr.HTML(EXTERNAL_LINKS)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:

@@ -147,6 +152,12 @@ with demo:
             )
 
         with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
+            DESCRIPTION_TEXT = """
+            The Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
+            We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685), covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
+
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
 
             leaderboard = overall_leaderboard(
                 get_model_leaderboard_df(

@@ -164,7 +175,21 @@ with demo:
             ))
 
         with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
-
+            DESCRIPTION_TEXT = """
+            Algebra, Geometry, and Probability are the current three main math domains in the leaderboard.
+            To mitigate the potential impact of data contamination, we have carefully selected the datasets from various sources.
+            We prioritize recent math datasets and focus on math questions at the college level and beyond.
+            The current datasets include
+            [MATH](https://arxiv.org/abs/2103.03874),
+            [MATH-500](https://github.com/openai/prm800k/tree/main/prm800k/math_splits),
+            [Omni](https://omni-math.github.io/),
+            [MathQA](https://arxiv.org/abs/1905.13319),
+            [MathBench](https://arxiv.org/abs/2405.12209),
+            [SciBench](https://arxiv.org/abs/2307.10635), and more!
+            We plan to include more math domains, such as calculus and number theory, in the future.
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
+
             # leaderboard = init_leaderboard(LEADERBOARD_DF)
             with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
                 leaderboard = overall_leaderboard(

@@ -217,8 +242,27 @@ with demo:
                 )
             )
 
-
         with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
+            DESCRIPTION_TEXT = """
+            Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective at distinguishing between modern LLMs.
+            Our current focus is on two challenging types of reasoning: logical reasoning and social reasoning, both of which present more meaningful and sophisticated ways to assess LLM performance.
+
+            For logical reasoning, we collect datasets from
+            [BigBench Hard (BBH)](https://arxiv.org/abs/2210.09261),
+            [FOLIO](https://arxiv.org/abs/2209.00840),
+            [LogiQA2.0](https://github.com/csitfun/LogiQA2.0),
+            [PrOntoQA](https://arxiv.org/abs/2210.01240), and
+            [ReClor](https://arxiv.org/abs/2002.04326).
+
+            For social reasoning, we collect datasets from
+            [MMToM-QA](https://arxiv.org/abs/2401.08743),
+            [BigToM](https://arxiv.org/abs/2306.15448),
+            [Adv-CSFB](https://arxiv.org/abs/2305.14763),
+            [SocialIQA](https://arxiv.org/abs/1904.09728), and
+            [NormBank](https://arxiv.org/abs/2305.17008).
+
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
 
             with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
                 leaderboard = overall_leaderboard(

@@ -254,13 +298,37 @@ with demo:
                 )
             )
 
-
-
-
-        with gr.TabItem("🔬 Science", elem_id="science-table", id=5):
-            gr.Markdown(COMING_SOON_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
+            CURRENT_TEXT = """
+            # Coming soon!
+            We are working on adding more tasks in scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
+            We have collected a diverse set of recent science datasets, including but not limited to
+            [GPQA](https://arxiv.org/abs/2311.12022),
+            [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
+            [MMLU-Pro](https://arxiv.org/abs/2406.01574),
+            [OlympiadBench](https://arxiv.org/abs/2402.14008),
+            [SciBench](https://arxiv.org/abs/2307.10635), and
+            [SciEval](https://arxiv.org/abs/2308.13149).
+            """
+            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
+
+
+        with gr.TabItem("</> Coding", elem_id="coding-tab-table", id=5):
+            CURRENT_TEXT = """
+            # Coming soon!
+            We are working on adding more tasks in coding domains to the leaderboard.
+            The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
+            We collect a variety of recent coding datasets, including
+            [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
+            [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp),
+            [HumanEvalFix](https://huggingface.co/datasets/bigcode/humanevalpack),
+            [newly crawled LeetCode data](https://leetcode.com/problemset/),
+            filtered code-related queries from [Arena-Hard-Auto](https://github.com/lmarena/arena-hard-auto), and more!
+            Our efforts also include synthesizing new code-related queries to ensure diversity!
+            """
+            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
 
 
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
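For reference, the pattern this commit leans on in app.py is: point model_result_path at the latest results JSONL, load it into a DataFrame, and render a short Markdown blurb above each tab's leaderboard. The sketch below is not the Space's actual code; get_model_leaderboard_df is assumed to read JSON Lines into a pandas DataFrame, and the tab contents are placeholders.

# Rough sketch of the pattern used above (assumptions noted inline; not the Space's code).
import gradio as gr
import pandas as pd

# Assumption: the results file is JSON Lines, one model record per line.
model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"

def get_model_leaderboard_df(path: str) -> pd.DataFrame:
    # Assumption: the real helper also selects and sorts columns; here we only load the records.
    return pd.read_json(path, lines=True)

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("🎯 Overall", id=1):
            # Per-tab description rendered as Markdown, as in the commit.
            gr.Markdown("Overall performance of LLMs across diverse tasks.")
            gr.Dataframe(value=get_model_leaderboard_df(model_result_path))

if __name__ == "__main__":
    demo.launch()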
src/about.py (CHANGED)

@@ -53,7 +53,19 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Decentralized Arena</h1>"""
+TITLE = """<h1 align="center" id="space-title">Decentralized Arena Leaderboard</h1>"""
+
+SUB_TITLE = """<h3 align="center" id="space-subtitle">Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions</h3>"""
+
+EXTERNAL_LINKS = """
+<h3 align="center" id="space-links">
+    <a href="https://de-arena.maitrix.org/" target="_blank">Blog</a> |
+    <a href="https://github.com/maitrix-org/de-arena" target="_blank">GitHub</a> |
+    <a href="https://de-arena.maitrix.org/images/Heading.mp4" target="">Video</a> |
+    <a href="https://maitrix.org/" target="_blank">@Maitrix.org</a> |
+    <a href="https://www.llm360.ai/" target="_blank">@LLM360</a>
+</h3>
+"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """

@@ -110,4 +122,9 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@misc{decentralized2024,
+    title={Decentralized Arena via Collective LLM Intelligence: Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions},
+    author={Yanbin Yin and Zhen Wang and Kun Zhou and Xiangdong Zhang and Shibo Hao and Yi Gu and Jieyuan Liu and Somanshu Singla and Tianyang Liu and Eric P. Xing and Zhengzhong Liu and Haojian Jin and Zhiting Hu},
+    year=2024
+}
 """
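The new CITATION_BUTTON_LABEL and CITATION_BUTTON_TEXT values are presumably surfaced through the leaderboard template's citation box, which this diff does not show. A minimal sketch of that wiring, assuming the Space keeps the stock accordion:

# Sketch only; assumes the standard citation accordion from the leaderboard template.
import gradio as gr
from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT

with gr.Blocks() as demo:
    with gr.Accordion("📙 Citation", open=False):
        gr.Textbox(
            value=CITATION_BUTTON_TEXT,   # the BibTeX entry added in this commit
            label=CITATION_BUTTON_LABEL,
            lines=8,
            show_copy_button=True,
        )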