Terry Zhuo committed
Commit 6c29798
1 Parent(s): ae7a86d
Files changed (2):
  1. app.py +19 -11
  2. src/text_content.py +11 -1
app.py CHANGED
@@ -9,7 +9,7 @@ import requests
 from huggingface_hub import HfApi

 from src.css_html import custom_css
- from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
+ from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL
 from src.utils import (
 AutoEvalColumn,
 fields,
@@ -23,11 +23,11 @@ from src.utils import (
 from datasets import load_dataset
 TOKEN = os.environ.get("TOKEN", None)
 api = HfApi(TOKEN)
- df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values("complete", ascending=False)
- task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="train").to_pandas()
- model_elo_mle_df = load_dataset("bigcode/bigcodebench-elo-model-with-tie", split="train").to_pandas()
- complete_solve_rate = load_dataset("bigcode/bigcodebench-complete-solve-rate", split="train").to_pandas()
- instruct_solve_rate = load_dataset("bigcode/bigcodebench-instruct-solve-rate", split="train").to_pandas()
+ df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values("elo_mle", ascending=False)
+ task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="task_no_tie").to_pandas()
+ bench_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="benchmark_tie").to_pandas()
+ complete_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="complete").to_pandas()
+ instruct_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="instruct").to_pandas()

 QUEUE_REPO = "bigcode/bigcodebench-requests"
 EVAL_REQUESTS_PATH = "eval-queue"
@@ -248,10 +248,9 @@ with demo:
 - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
 - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
 - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is bootstrapped 500 times.
- - `size` is the amount of activated model weight during inference.
+ - `size` (optional) is the amount of activated model weight during inference.
 - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
 - For more details check the 📝 About section.
- - Models with a 🔴 symbol represent external evaluation submission, this means that we didn't verify the results, you can find the author's submission under `Submission PR` field from `See All Columns` tab.
 """,
 elem_classes="markdown-text",
 )
@@ -265,7 +264,7 @@ with demo:
 with gr.Group():
 gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
 model_elo_map = gr.Plot()
- demo.load(plot_elo_mle, [gr.Dataframe(model_elo_mle_df, visible=False)], model_elo_map)
+ demo.load(plot_elo_mle, [gr.Dataframe(bench_elo_mle_df, visible=False)], model_elo_map)

 with gr.TabItem("🧩 Solve Rate", id=2):
 with gr.Column():
@@ -280,8 +279,17 @@ with demo:

 with gr.TabItem("📝 About", id=3):
 gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
- with gr.TabItem("Submit Results 🚀", id=4):
+ with gr.TabItem("Submit/Request Results 🚀", id=4):
 gr.Markdown(SUBMISSION_TEXT_3)
-
+
+ with gr.Row():
+ with gr.Accordion("📙 Citation", open=False):
+ citation_button = gr.Textbox(
+ value=CITATION_BUTTON_TEXT,
+ label=CITATION_BUTTON_LABEL,
+ lines=20,
+ elem_id="citation-button",
+ show_copy_button=True,
+ )

 demo.launch()
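
Note on the `elo_mle` column referenced in the leaderboard description above: the app only loads precomputed ratings from `bigcode/bigcodebench-elo`, so the fitting code is not part of this commit. Below is a minimal sketch of how a bootstrapped maximum-likelihood Elo rating is commonly computed; the `battles` dataframe with `model_a`/`model_b`/`winner` columns is a hypothetical input, not the leaderboard's actual schema.

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

def mle_elo(battles, scale=400, base=10, init_rating=1000):
    # Maximum-likelihood Elo: logistic regression on pairwise outcomes,
    # one indicator column per model, no intercept.
    names = sorted(set(battles["model_a"]) | set(battles["model_b"]))
    idx = pd.Series(np.arange(len(names)), index=names)
    X = np.zeros((len(battles), len(names)))
    X[np.arange(len(battles)), idx[battles["model_a"]].values] = +np.log(base)
    X[np.arange(len(battles)), idx[battles["model_b"]].values] = -np.log(base)
    y = (battles["winner"] == "model_a").astype(int).values
    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, y)
    return pd.Series(scale * lr.coef_[0] + init_rating, index=names).sort_values(ascending=False)

def bootstrap_mle_elo(battles, rounds=500, seed=42):
    # Resample the battles with replacement `rounds` times (500 per the description) and refit.
    rng = np.random.default_rng(seed)
    fits = [mle_elo(battles.sample(frac=1.0, replace=True, random_state=int(rng.integers(1 << 31))))
            for _ in range(rounds)]
    return pd.DataFrame(fits)  # one row per bootstrap round, one column per model

Each model's rating distribution over the 500 rounds is what a plot like `plot_elo_mle` would summarize; how ties are handled is presumably what separates the `task_no_tie` and `benchmark_tie` splits loaded above.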
src/text_content.py CHANGED
@@ -122,6 +122,16 @@ To submit your results create a **Pull Request** in the community tab to add the
 The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replace org and model with those corresponding to the model you evaluated.
 """

+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+
+ CITATION_BUTTON_TEXT = r"""
+ @article{bigcodebench,
+   title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
+   author={Zhuo, Terry Yue and Vu, Min Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and Brunner, Simon and Gong, Chen and Hoang, Thong and Zebaze, Armel Randy and Hong, Xiaoheng and Li, Wen-Ding and Kaddour, Jean and Xu, Ming and Zhang, Zhihan and Yadav, Prateek and Jain, Naman and Gu, Alex and Cheng, Zhoujun and Liu, Jiawei and Liu, Qian and Wang, Zijian and Lo, David and Hui, Binyuan and Muennighoff, Niklas and Fried, Daniel and Du, Xiaoning and de Vries, Harm and Von Werra, Leandro},
+   year={2024}
+ }
+ """
+
 SUBMISSION_TEXT_3="""
- Coming soon...
+ We welcome the community to request new models to be added to the leaderboard. Please [submit a PR here](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to add the model to the leaderboard 🤗
 """