yzabc007 committed on
Commit
606c189
·
1 Parent(s): 056a0a0

Update space

Browse files
Files changed (2) hide show
  1. app.py +76 -8
  2. src/about.py +18 -1
app.py CHANGED
@@ -11,6 +11,8 @@ from src.about import (
11
  INTRODUCTION_TEXT,
12
  LLM_BENCHMARKS_TEXT,
13
  TITLE,
 
 
14
  COMING_SOON_TEXT
15
  )
16
  from src.display.css_html_js import custom_css
@@ -99,7 +101,8 @@ def init_leaderboard(dataframe):
99
 
100
  # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
101
  # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
102
- model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
 
103
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
104
 
105
 
@@ -124,6 +127,8 @@ def overall_leaderboard(dataframe):
124
  demo = gr.Blocks(css=custom_css)
125
  with demo:
126
  gr.HTML(TITLE)
 
 
127
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
128
 
129
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
@@ -147,6 +152,12 @@ with demo:
147
  )
148
 
149
  with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
 
 
 
 
 
 
150
 
151
  leaderboard = overall_leaderboard(
152
  get_model_leaderboard_df(
@@ -164,7 +175,21 @@ with demo:
164
  ))
165
 
166
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
167
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  # leaderboard = init_leaderboard(LEADERBOARD_DF)
169
  with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
170
  leaderboard = overall_leaderboard(
@@ -217,8 +242,27 @@ with demo:
217
  )
218
  )
219
 
220
-
221
  with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
224
  leaderboard = overall_leaderboard(
@@ -254,13 +298,37 @@ with demo:
254
  )
255
  )
256
 
257
-
258
- with gr.TabItem("</> Coding", elem_id="coding-tab-table", id=4):
259
- gr.Markdown(COMING_SOON_TEXT, elem_classes="markdown-text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
 
262
- with gr.TabItem("🔬 Science", elem_id="science-table", id=5):
263
- gr.Markdown(COMING_SOON_TEXT, elem_classes="markdown-text")
264
 
265
 
266
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
 
11
  INTRODUCTION_TEXT,
12
  LLM_BENCHMARKS_TEXT,
13
  TITLE,
14
+ SUB_TITLE,
15
+ EXTERNAL_LINKS,
16
  COMING_SOON_TEXT
17
  )
18
  from src.display.css_html_js import custom_css
 
101
 
102
  # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
103
  # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
104
+ # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
105
+ model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
106
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
107
 
108
 
 
127
  demo = gr.Blocks(css=custom_css)
128
  with demo:
129
  gr.HTML(TITLE)
130
+ gr.HTML(SUB_TITLE)
131
+ gr.HTML(EXTERNAL_LINKS)
132
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
133
 
134
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
152
  )
153
 
154
  with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
155
+ DESCRIPTION_TEXT = """
156
+ Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
157
+ We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685), covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
158
+
159
+ """
160
+ gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
161
 
162
  leaderboard = overall_leaderboard(
163
  get_model_leaderboard_df(
 
175
  ))
176
 
177
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
178
+ DESCRIPTION_TEXT="""
179
+ Algebra, Geometry, and Probability are the current three main math domains in the leaderboard.
180
+ To mitigate the potential impact of data contamination, we have carefully selected the datasets from various sources.
181
+ We prioritize recent math datasets and focus on college and beyond level math questions.
182
+ The current datasets include
183
+ [MATH](https://arxiv.org/abs/2103.03874),
184
+ [MATH-500](https://github.com/openai/prm800k/tree/main/prm800k/math_splits),
185
+ [Omni](https://omni-math.github.io/),
186
+ [MathQA](https://arxiv.org/abs/1905.13319),
187
+ [MathBench](https://arxiv.org/abs/2405.12209),
188
+ [SciBench](https://arxiv.org/abs/2307.10635), and more!
189
+ We plan to include more math domains, such as calculus, number theory, and more in the future.
190
+ """
191
+ gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
192
+
193
  # leaderboard = init_leaderboard(LEADERBOARD_DF)
194
  with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
195
  leaderboard = overall_leaderboard(
 
242
  )
243
  )
244
 
 
245
  with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
246
+ DESCRIPTION_TEXT = """
247
+ Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective at distinguishing between modern LLMs.
248
+ Our current focus is on two challenging types of reasoning: logical reasoning and social reasoning, both of which present more meaningful and sophisticated ways to assess LLM performance.
249
+
250
+ For logical reasoning, we collect datasets from
251
+ [BigBench Hard (BBH)](https://arxiv.org/abs/2210.09261),
252
+ [FOLIO](https://arxiv.org/abs/2209.00840),
253
+ [LogiQA2.0](https://github.com/csitfun/LogiQA2.0),
254
+ [PrOntoQA](https://arxiv.org/abs/2210.01240),
255
+ [ReClor](https://arxiv.org/abs/2002.04326).
256
+
257
+ For social reasoning, we collect datasets from
258
+ [MMToM-QA](https://arxiv.org/abs/2401.08743),
259
+ [BigToM](https://arxiv.org/abs/2306.15448),
260
+ [Adv-CSFB](https://arxiv.org/abs/2305.14763),
261
+ [SocialIQA](https://arxiv.org/abs/1904.09728),
262
+ [NormBank](https://arxiv.org/abs/2305.17008).
263
+
264
+ """
265
+ gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
266
 
267
  with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
268
  leaderboard = overall_leaderboard(
 
298
  )
299
  )
300
 
301
+ with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
302
+ CURRENT_TEXT = """
303
+ # Coming soon!
304
+ We are working on adding more tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
305
+ We have diversely and aggressively collected recent science datasets, including but not limited to
306
+ [GPQA](https://arxiv.org/abs/2311.12022),
307
+ [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
308
+ [MMLU-Pro](https://arxiv.org/abs/2406.01574),
309
+ [OlympiadBench](https://arxiv.org/abs/2402.14008),
310
+ [SciBench](https://arxiv.org/abs/2307.10635),
311
+ [SciEval](https://arxiv.org/abs/2308.13149).
312
+ """
313
+ gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
314
+
315
+
316
+ with gr.TabItem("</> Coding", elem_id="coding-tab-table", id=5):
317
+ CURRENT_TEXT = """
318
+ # Coming soon!
319
+ We are working on adding more tasks in coding domains to the leaderboard.
320
+ The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
321
+ We collect a variety of recent coding datasets, including
322
+ [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
323
+ [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp),
324
+ [HumanEvalFix](https://huggingface.co/datasets/bigcode/humanevalpack),
325
+ [newly crawled LeetCode data](https://leetcode.com/problemset/),
326
+ filtered code-related queries from [Arena-Hard-Auto](https://github.com/lmarena/arena-hard-auto) and more!
327
+ Our efforts also include synthesizing new code-related queries to ensure diversity!
328
+ """
329
+ gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
330
 
331
 
 
 
332
 
333
 
334
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
src/about.py CHANGED
@@ -53,7 +53,19 @@ NUM_FEWSHOT = 0 # Change with your few shot
53
 
54
 
55
  # Your leaderboard name
56
- TITLE = """<h1 align="center" id="space-title">Decentralized Arena</h1>"""
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  # What does your leaderboard evaluate?
59
  INTRODUCTION_TEXT = """
@@ -110,4 +122,9 @@ If everything is done, check you can launch the EleutherAIHarness on your model
110
 
111
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
112
  CITATION_BUTTON_TEXT = r"""
 
 
 
 
 
113
  """
 
53
 
54
 
55
  # Your leaderboard name
56
+ TITLE = """<h1 align="center" id="space-title">Decentralized Arena Leaderboard</h1>"""
57
+
58
+ SUB_TITLE = """<h3 align="center" id="space-subtitle">Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions</h3>"""
59
+
60
+ EXTERNAL_LINKS = """
61
+ <h3 align="center" id="space-links">
62
+ <a href="https://de-arena.maitrix.org/" target="_blank">Blog</a> |
63
+ <a href="https://github.com/maitrix-org/de-arena" target="_blank">GitHub</a> |
64
+ <a href="https://de-arena.maitrix.org/images/Heading.mp4" target="">Video</a> |
65
+ <a href="https://maitrix.org/" target="_blank">@Maitrix.org</a> |
66
+ <a href="https://www.llm360.ai/" target="_blank">@LLM360</a>
67
+ </h3>
68
+ """
69
 
70
  # What does your leaderboard evaluate?
71
  INTRODUCTION_TEXT = """
 
122
 
123
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
124
  CITATION_BUTTON_TEXT = r"""
125
+ @misc{decentralized2024,
126
+ title={Decentralized Arena via Collective LLM Intelligence: Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions},
127
+ author={Yanbin Yin and Zhen Wang and Kun Zhou and Xiangdong Zhang and Shibo Hao and Yi Gu and Jieyuan Liu and Somanshu Singla and Tianyang Liu and Eric P. Xing and Zhengzhong Liu and Haojian Jin and Zhiting Hu},
128
+ year=2024
129
+ }
130
  """