Terry Zhuo committed on
Commit
80c91d0
·
1 Parent(s): c19490f
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  # Byte-compiled / optimized / DLL files
 
2
  __pycache__/
3
  *.py[cod]
4
  *$py.class
 
1
  # Byte-compiled / optimized / DLL files
2
+ _/
3
  __pycache__/
4
  *.py[cod]
5
  *$py.class
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md CHANGED
@@ -1,11 +1,18 @@
1
  ---
2
- title: BigCodeBench Evaluator
3
- emoji: 💻
4
- colorFrom: indigo
5
  colorTo: indigo
6
- sdk: docker
 
 
 
7
  pinned: false
8
  license: apache-2.0
 
 
 
 
 
9
  ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: BigCodeBench Leaderboard
3
+ emoji: 🥇
4
+ colorFrom: green
5
  colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.36.1
8
+ app_file: app.py
9
+ disable_embedding: true
10
  pinned: false
11
  license: apache-2.0
12
+ tags:
13
+ - leaderboard
14
+ - eval:code
15
+ - test:public
16
+ - judge:auto
17
  ---
18
+ Paper:arxiv.org/abs/2406.15877
 
app.py CHANGED
@@ -1,178 +1,649 @@
1
- import gradio as gr
2
- import subprocess
3
- import sys
4
  import os
5
- import threading
6
  import time
7
- import uuid
8
- import glob
9
- import shutil
10
- from pathlib import Path
 
 
 
11
  from apscheduler.schedulers.background import BackgroundScheduler
12
 
13
- default_command = "bigcodebench.evaluate"
14
- is_running = False
15
- lock = threading.Lock()
16
 
17
- def generate_command(
18
- jsonl_file, split, subset, parallel,
19
- min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
20
- check_gt_only, no_gt
21
- ):
22
- command = [default_command]
23
-
24
- if jsonl_file is not None:
25
- # Copy the uploaded file to the current directory
26
- local_filename = os.path.basename(jsonl_file.name)
27
- shutil.copy(jsonl_file.name, local_filename)
28
- command.extend(["--samples", local_filename])
29
-
30
- command.extend(["--split", split, "--subset", subset])
31
-
32
- if parallel is not None and parallel != 0:
33
- command.extend(["--parallel", str(int(parallel))])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- command.extend([
36
- "--min-time-limit", str(min_time_limit),
37
- "--max-as-limit", str(int(max_as_limit)),
38
- "--max-data-limit", str(int(max_data_limit)),
39
- "--max-stack-limit", str(int(max_stack_limit))
40
- ])
41
 
42
- if check_gt_only:
43
- command.append("--check-gt-only")
44
 
45
- if no_gt:
46
- command.append("--no-gt")
 
 
 
47
 
48
- return " ".join(command)
 
 
 
 
 
 
 
 
49
 
 
 
50
 
51
- def cleanup_previous_files(jsonl_file):
52
- if jsonl_file is not None:
53
- file_list = ['Dockerfile', 'app.py', 'README.md', os.path.basename(jsonl_file.name), "__pycache__"]
54
- else:
55
- file_list = ['Dockerfile', 'app.py', 'README.md', "__pycache__"]
56
- for file in glob.glob("*"):
 
 
 
 
 
 
 
 
 
 
 
 
57
  try:
58
- if file not in file_list:
59
- os.remove(file)
60
- except Exception as e:
61
- print(f"Error during cleanup of {file}: {e}")
62
-
63
- def find_result_file():
64
- json_files = glob.glob("*.json")
65
- if json_files:
66
- return max(json_files, key=os.path.getmtime)
67
- return None
68
-
69
- def run_bigcodebench(command):
70
- global is_running
71
- with lock:
72
- if is_running:
73
- yield "A command is already running. Please wait for it to finish.\n"
74
  return
75
- is_running = True
 
 
 
 
 
76
 
77
- try:
78
- yield f"Executing command: {command}\n"
79
-
80
- process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
81
-
82
- for line in process.stdout:
83
- yield line
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
- # process.wait()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- if process.returncode != 0:
88
- yield f"Error: Command exited with status {process.returncode}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- yield "Evaluation completed.\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- result_file = find_result_file()
93
- if result_file:
94
- yield f"Result file found: {result_file}\n"
95
- else:
96
- yield "No result file found.\n"
97
- finally:
98
- with lock:
99
- is_running = False
100
-
101
- def stream_logs(command, jsonl_file=None):
102
- global is_running
 
 
 
 
 
 
 
103
 
104
- if is_running:
105
- yield "A command is already running. Please wait for it to finish.\n"
106
- return
107
-
108
- cleanup_previous_files(jsonl_file)
109
- yield "Cleaned up previous files.\n"
110
 
111
- log_content = []
112
- for log_line in run_bigcodebench(command):
113
- log_content.append(log_line)
114
- yield "".join(log_content)
 
 
 
 
 
 
 
115
 
116
- with gr.Blocks() as demo:
117
- gr.Markdown("# BigCodeBench Evaluator")
118
-
119
- with gr.Row():
120
- jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
121
- split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
122
- subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
 
 
 
 
 
 
 
 
 
 
 
123
 
124
- with gr.Row():
125
- parallel = gr.Number(label="Parallel (optional)", precision=0)
126
- min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
127
- max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
- with gr.Row():
130
- max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
131
- max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
132
- check_gt_only = gr.Checkbox(label="Check GT Only")
133
- no_gt = gr.Checkbox(label="No GT")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
136
  with gr.Row():
137
- submit_btn = gr.Button("Run Evaluation")
138
- download_btn = gr.DownloadButton(label="Download Result")
139
- log_output = gr.Textbox(label="Execution Logs", lines=20)
140
-
141
- input_components = [
142
- jsonl_file, split, subset, parallel,
143
- min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
144
- check_gt_only, no_gt
145
- ]
146
-
147
- for component in input_components:
148
- component.change(generate_command, inputs=input_components, outputs=command_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
-
151
- def start_evaluation(command, jsonl_file, subset, split):
152
- extra = subset + "_" if subset != "full" else ""
153
- if jsonl_file is not None:
154
- result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
155
- else:
156
- result_path = None
157
-
158
- for log in stream_logs(command, jsonl_file):
159
- if jsonl_file is not None:
160
- yield log, gr.update(value=result_path, label=result_path), gr.update()
161
- else:
162
- yield log, gr.update(), gr.update()
163
- is_running = False
164
- result_file = find_result_file()
165
- if result_file:
166
- return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
167
- # gr.Button(visible=False)#,
168
- # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
169
- else:
170
- return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
171
- # gr.Button("Run Evaluation", visible=True),
172
- # gr.DownloadButton(visible=False))
173
- submit_btn.click(start_evaluation,
174
- inputs=[command_output, jsonl_file, subset, split],
175
- outputs=[log_output, download_btn])
176
-
177
- demo.queue(max_size=300).launch(share=True, server_name="0.0.0.0", server_port=7860)
178
- scheduler = BackgroundScheduler()
 
 
 
 
1
  import os
2
+ import logging
3
  import time
4
+ import schedule
5
+ import datetime
6
+ import gradio as gr
7
+ from threading import Thread
8
+ import datasets
9
+ from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
10
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
11
  from apscheduler.schedulers.background import BackgroundScheduler
12
 
13
+ # Start ephemeral Spaces on PRs (see config in README.md)
14
+ from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
 
15
 
16
+ from src.display.about import (
17
+ CITATION_BUTTON_LABEL,
18
+ CITATION_BUTTON_TEXT,
19
+ # INTRODUCTION_TEXT,
20
+ TITLE,
21
+ ABOUT_TEXT,
22
+ SUBMISSION_TEXT_3,
23
+ )
24
+ from src.display.css_html_js import custom_css
25
+ from src.display.utils import (
26
+ COLS,
27
+ EVAL_COLS,
28
+ EVAL_TYPES,
29
+ AutoEvalColumn,
30
+ fields,
31
+ EvalQueueColumn
32
+ )
33
+ from src.envs import (
34
+ API,
35
+ EVAL_REQUESTS_PATH,
36
+ RESULT_REPO,
37
+ DATA_VERSION,
38
+ DATA_REPO,
39
+ HARD_RESULT_REPO,
40
+ ELO_REPO,
41
+ HARD_ELO_REPO,
42
+ SOLVE_REPO,
43
+ HARD_SOLVE_REPO,
44
+ HF_TOKEN,
45
+ QUEUE_REPO,
46
+ REPO_ID,
47
+ VOTES_REPO,
48
+ VOTES_PATH,
49
+ HF_HOME,
50
+ )
51
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
52
+ from src.execute import generate_command, is_running, lock, stream_logs, find_result_file
53
+ from src.tools.plots import plot_elo_mle, plot_solve_rate
54
+ # from src.voting.vote_system import VoteManager, run_scheduler
55
+
56
+ # Configure logging
57
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
58
+
59
+ # Start ephemeral Spaces on PRs (see config in README.md)
60
+ from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
61
+
62
+ # Intended to convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
63
+ # This controls whether a full initialization should be performed. NOTE: the environment lookup is currently commented out, so the value is always True.
64
+ DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
65
+ NEW_DATA_ON_LEADERBOARD = True
66
+ LEADERBOARD_DF = None
67
+ HARD_LEADERBOARD_DF = None
68
+ ELO_TASK_DF = None
69
+ ELO_BENCH_DF = None
70
+ HARD_ELO_TASK_DF = None
71
+ HARD_ELO_BENCH_DF = None
72
+ COMPLETE_SOLVE_DF = None
73
+ INSTRUCT_SOLVE_DF = None
74
+ HARD_COMPLETE_SOLVE_DF = None
75
+ HARD_INSTRUCT_SOLVE_DF = None
76
+
77
+ DATA = datasets.load_dataset(DATA_REPO, "default", cache_dir=HF_HOME, split=DATA_VERSION,
78
+ verification_mode="no_checks")
79
+
80
+
81
+ def filter_data(data, keyword):
82
+ if not keyword:
83
+ return data
84
+ filtered_data = [item for item in data if keyword.lower() in item['complete_prompt'].lower()]
85
+ return filtered_data
86
+
87
+
88
+ def update_display(search_keyword, index, show_test):
89
+ filtered_data = filter_data(DATA, search_keyword)
90
 
91
+ if not filtered_data:
92
+ return ["No data available. Check the search criteria."] + [""] * 4 + [0, gr.update(maximum=0, value=0)]
 
 
 
 
93
 
94
+ max_index = len(filtered_data) - 1
95
+ index = min(max(0, index), max_index)
96
 
97
+ task_id = filtered_data[index]['task_id']
98
+ snippet1 = filtered_data[index]['complete_prompt']
99
+ snippet2 = filtered_data[index]['instruct_prompt']
100
+ # snippet3 = filtered_data[index]['canonical_solution'] if show_solution else ""
101
+ snippet4 = filtered_data[index]['test'] if show_test else ""
102
 
103
+ return [
104
+ task_id,
105
+ snippet1,
106
+ snippet2,
107
+ # snippet3,
108
+ snippet4,
109
+ len(filtered_data),
110
+ gr.update(maximum=max_index, value=index)
111
+ ]
112
 
113
+ def restart_space():
114
+ API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
115
 
116
+
117
+ def time_diff_wrapper(func):
118
+ def wrapper(*args, **kwargs):
119
+ start_time = time.time()
120
+ result = func(*args, **kwargs)
121
+ end_time = time.time()
122
+ diff = end_time - start_time
123
+ logging.info(f"Time taken for {func.__name__}: {diff} seconds")
124
+ return result
125
+
126
+ return wrapper
127
+
128
+
129
+ @time_diff_wrapper
130
+ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
131
+ """Download dataset with exponential backoff retries."""
132
+ attempt = 0
133
+ while attempt < max_attempts:
134
  try:
135
+ logging.info(f"Downloading {repo_id} to {local_dir}")
136
+ snapshot_download(
137
+ repo_id=repo_id,
138
+ local_dir=local_dir,
139
+ repo_type=repo_type,
140
+ tqdm_class=None,
141
+ etag_timeout=30,
142
+ max_workers=8,
143
+ )
144
+ logging.info("Download successful")
 
 
 
 
 
 
145
  return
146
+ except Exception as e:
147
+ wait_time = backoff_factor**attempt
148
+ logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
149
+ time.sleep(wait_time)
150
+ attempt += 1
151
+ raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
152
 
153
+ def get_latest_data_leaderboard(
154
+ leaderboard_initial_df = None,
155
+ hard_leaderboard_initial_df = None,
156
+ elo_task_df = None,
157
+ elo_bench_df = None,
158
+ hard_elo_task_df = None,
159
+ hard_elo_bench_df = None,
160
+ complete_solve_df = None,
161
+ instruct_solve_df = None,
162
+ hard_complete_solve_df = None,
163
+ hard_instruct_solve_df = None
164
+ ):
165
+ global NEW_DATA_ON_LEADERBOARD
166
+ global LEADERBOARD_DF
167
+ global HARD_LEADERBOARD_DF
168
+ global ELO_TASK_DF
169
+ global ELO_BENCH_DF
170
+ global HARD_ELO_TASK_DF
171
+ global HARD_ELO_BENCH_DF
172
+ global COMPLETE_SOLVE_DF
173
+ global INSTRUCT_SOLVE_DF
174
+ global HARD_COMPLETE_SOLVE_DF
175
+ global HARD_INSTRUCT_SOLVE_DF
176
+
177
+ if NEW_DATA_ON_LEADERBOARD:
178
+ print("Leaderboard updated at reload!")
179
+ leaderboard_dataset = datasets.load_dataset(
180
+ RESULT_REPO,
181
+ "default",
182
+ split="train",
183
+ cache_dir=HF_HOME,
184
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
185
+ verification_mode="no_checks"
186
+ )
187
+ LEADERBOARD_DF = get_leaderboard_df(
188
+ leaderboard_dataset=leaderboard_dataset,
189
+ cols=COLS,
190
+ )
191
+ hard_leaderboard_dataset = datasets.load_dataset(
192
+ HARD_RESULT_REPO,
193
+ "default",
194
+ split="train",
195
+ cache_dir=HF_HOME,
196
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
197
+ verification_mode="no_checks"
198
+ )
199
+ hard_leaderboard_df = get_leaderboard_df(
200
+ leaderboard_dataset=hard_leaderboard_dataset,
201
+ cols=COLS,
202
+ )
203
+ HARD_LEADERBOARD_DF = hard_leaderboard_df
204
 
205
+ elo_task_df = datasets.load_dataset(
206
+ ELO_REPO,
207
+ "default",
208
+ split="task_no_tie",
209
+ cache_dir=HF_HOME,
210
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
211
+ verification_mode="no_checks"
212
+ ).to_pandas()
213
+ elo_bench_df = datasets.load_dataset(
214
+ ELO_REPO,
215
+ "default",
216
+ split="benchmark_tie",
217
+ cache_dir=HF_HOME,
218
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
219
+ verification_mode="no_checks"
220
+ ).to_pandas()
221
+ ELO_TASK_DF = elo_task_df
222
+ ELO_BENCH_DF = elo_bench_df
223
 
224
+ hard_elo_task_df = datasets.load_dataset(
225
+ HARD_ELO_REPO,
226
+ "default",
227
+ split="task_no_tie",
228
+ cache_dir=HF_HOME,
229
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
230
+ verification_mode="no_checks"
231
+ ).to_pandas()
232
+ hard_elo_bench_df = datasets.load_dataset(
233
+ HARD_ELO_REPO,
234
+ "default",
235
+ split="benchmark_tie",
236
+ cache_dir=HF_HOME,
237
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
238
+ verification_mode="no_checks"
239
+ ).to_pandas()
240
+ HARD_ELO_TASK_DF = hard_elo_task_df
241
+ HARD_ELO_BENCH_DF = hard_elo_bench_df
242
 
243
+ complete_solve_df = datasets.load_dataset(
244
+ SOLVE_REPO,
245
+ "default",
246
+ split="complete",
247
+ cache_dir=HF_HOME,
248
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
249
+ verification_mode="no_checks"
250
+ ).to_pandas()
251
+ instruct_solve_df = datasets.load_dataset(
252
+ SOLVE_REPO,
253
+ "default",
254
+ split="instruct",
255
+ cache_dir=HF_HOME,
256
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
257
+ verification_mode="no_checks"
258
+ ).to_pandas()
259
+ COMPLETE_SOLVE_DF = complete_solve_df
260
+ INSTRUCT_SOLVE_DF = instruct_solve_df
261
 
262
+ hard_complete_solve_df = datasets.load_dataset(
263
+ HARD_SOLVE_REPO,
264
+ "default",
265
+ split="complete",
266
+ cache_dir=HF_HOME,
267
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
268
+ verification_mode="no_checks"
269
+ ).to_pandas()
270
+ hard_instruct_solve_df = datasets.load_dataset(
271
+ HARD_SOLVE_REPO,
272
+ "default",
273
+ split="instruct",
274
+ cache_dir=HF_HOME,
275
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
276
+ verification_mode="no_checks"
277
+ ).to_pandas()
278
+ HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
279
+ HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
280
 
281
+ NEW_DATA_ON_LEADERBOARD = False
 
 
 
 
 
282
 
283
+ else:
284
+ LEADERBOARD_DF = leaderboard_initial_df
285
+ # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
286
+ ELO_TASK_DF = elo_task_df
287
+ # ELO_BENCH_DF = elo_bench_df
288
+ # HARD_ELO_TASK_DF = hard_elo_task_df
289
+ HARD_ELO_BENCH_DF = hard_elo_bench_df
290
+ COMPLETE_SOLVE_DF = complete_solve_df
291
+ # INSTRUCT_SOLVE_DF = instruct_solve_df
292
+ # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
293
+ HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
294
 
295
+ return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
296
+ # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
297
+
298
+
299
+ def init_space():
300
+ """Initializes the application space, loading only necessary data."""
301
+
302
+ # Always redownload the leaderboard DataFrame
303
+ global LEADERBOARD_DF
304
+ global HARD_LEADERBOARD_DF
305
+ global ELO_TASK_DF
306
+ global ELO_BENCH_DF
307
+ global HARD_ELO_TASK_DF
308
+ global HARD_ELO_BENCH_DF
309
+ global COMPLETE_SOLVE_DF
310
+ global INSTRUCT_SOLVE_DF
311
+ global HARD_COMPLETE_SOLVE_DF
312
+ global HARD_INSTRUCT_SOLVE_DF
313
 
314
+ LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
315
+ # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
316
+
317
+ return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
318
+ # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
319
+
320
+ # Initialize VoteManager
321
+ # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
322
+
323
+
324
+ # Schedule the upload_votes method to run every 15 minutes
325
+ # schedule.every(15).minutes.do(vote_manager.upload_votes)
326
+
327
+ # Start the scheduler in a separate thread
328
+ # scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
329
+ # scheduler_thread.start()
330
+
331
+ # Calls the init_space function, which takes no arguments (the `DO_FULL_INIT` flag is not passed to it).
332
+ # This initializes the various DataFrames used throughout the application.
333
+ LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
334
+ ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
335
+ COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
336
+ HARD_INSTRUCT_SOLVE_DF = init_space()
337
+ # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
338
+
339
+ # Data processing for plots now only on demand in the respective Gradio tab
340
+ # def load_and_create_plots():
341
+ # plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
342
+ # return plot_df
343
+
344
+ # Function to check if a user is logged in
345
+ def check_login(profile: gr.OAuthProfile | None) -> bool:
346
+ if profile is None:
347
+ return False
348
+ return True
349
+
350
+ def init_leaderboard(dataframe):
351
+ if dataframe is None or dataframe.empty:
352
+ raise ValueError("Leaderboard DataFrame is empty or None.")
353
+ return Leaderboard(
354
+ value=dataframe,
355
+ datatype=[c.type for c in fields(AutoEvalColumn)],
356
+ select_columns=SelectColumns(
357
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
358
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
359
+ label="Select Columns to Display:",
360
+ ),
361
+ search_columns=[AutoEvalColumn.model.name],
362
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
363
+ filter_columns=[
364
+ ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
365
+ ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
366
+ ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
367
+ ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
368
+ ],
369
+ bool_checkboxgroup_label="Hide models",
370
+ interactive=False,
371
+ )
372
+
373
+
374
+ def init_others(dataframe):
375
+ if dataframe is None or dataframe.empty:
376
+ raise ValueError("Gradio DataFrame is empty or None.")
377
+ return gr.Dataframe(dataframe, visible=False)
378
+
379
+ main_block = gr.Blocks(css=custom_css)
380
+ with main_block as demo:
381
+ with gr.Row(elem_id="header-row"):
382
+ gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")
383
 
384
+ # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
385
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
386
+ with gr.Tab("💎 Hard Set") as hard_tabs:
387
+ with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
388
+ hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
389
+ gr.Markdown(
390
+ """
391
+ **Notes:**
392
+ - For the efficiency reasons, we only display the Hard Set leaderboard.
393
+ - _Hard Set_ vs _Full Set_:
394
+ - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
395
+ - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
396
+ - _Complete_ vs _Instruct_:
397
+ - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
398
+ - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
399
+ - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
400
+ - `Average` is the average of `Complete` and `Instruct` when both are available.
401
+ - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
402
+ - `#Act Params (B)` is the number of activated model parameters during inference.
403
+ - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
404
+ - For more details check the 📝 About section.
405
+ """,
406
+ elem_classes="markdown-text",
407
+ )
408
+
409
+ with gr.TabItem("📊 Elo Rating", id="hard_elo"):
410
+ with gr.Column():
411
+ with gr.Group():
412
+ gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
413
+ hard_task_elo_map = gr.Plot()
414
+ hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
415
+ demo.load(plot_elo_mle, [hard_elo_task_gr],
416
+ hard_task_elo_map)
417
+ with gr.Group():
418
+ gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
419
+ hard_bench_elo_map = gr.Plot()
420
+ hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
421
+ demo.load(plot_elo_mle, [hard_elo_bench_gr],
422
+ hard_bench_elo_map)
423
+
424
+ with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
425
+ with gr.Column():
426
+ hard_complete_map = gr.Plot()
427
+ hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
428
+ demo.load(plot_solve_rate, [hard_complete_solve_gr,
429
+ gr.Textbox("Complete", visible=False),
430
+ gr.Number(10, visible=False),
431
+ gr.Number(16, visible=False),
432
+ ], hard_complete_map)
433
+ hard_instruct_map = gr.Plot()
434
+ hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
435
+ demo.load(plot_solve_rate, [hard_instruct_solve_gr,
436
+ gr.Textbox("Instruct", visible=False),
437
+ gr.Number(10, visible=False),
438
+ gr.Number(16, visible=False),
439
+ ], hard_instruct_map)
440
+ with gr.Tab("🎯 Full Set") as full_tabs:
441
+ with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
442
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
443
+ gr.Markdown(
444
+ """
445
+ **Notes:**
446
+ - _Complete_ vs _Instruct_:
447
+ - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
448
+ - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
449
+ - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
450
+ - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
451
+ - `size` is the amount of activated model weight during inference.
452
+ - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
453
+ - For more details check the 📝 About section.
454
+ """,
455
+ elem_classes="markdown-text",
456
+ )
457
+
458
+ with gr.TabItem("📊 Elo Rating", id="full_elo"):
459
+ with gr.Column():
460
+ with gr.Group():
461
+
462
+ gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
463
+ task_elo_map = gr.Plot()
464
+ elo_task_gr = init_others(ELO_TASK_DF)
465
+ demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
466
+ with gr.Group():
467
+ gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
468
+ bench_elo_map = gr.Plot()
469
+ elo_bench_gr = init_others(ELO_BENCH_DF)
470
+ demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
471
+
472
+ with gr.TabItem("🧩 Solve Rate", id="full_solve"):
473
+ with gr.Column():
474
+ complete_map = gr.Plot()
475
+ complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
476
+ demo.load(plot_solve_rate, [complete_solve_gr,
477
+ gr.Textbox("Complete", visible=False),
478
+ ], complete_map)
479
+ instruct_map = gr.Plot()
480
+ instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
481
+ demo.load(plot_solve_rate, [instruct_solve_gr,
482
+ gr.Textbox("Instruct", visible=False),
483
+ ], instruct_map)
484
+ with gr.TabItem("📝 About", id=3):
485
+ gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
486
+ with gr.TabItem("🔎 Data Viewer", id="viewer"):
487
+ search_input = gr.Textbox(label="Search by keyword")
488
+ count_output = gr.Number(label="Number of filtered items")
489
+ index_slider = gr.Slider(minimum=0, maximum=len(DATA)-1, step=1, label="Select Index")
490
+ # show_solution = gr.Checkbox(label="Show Solution")
491
+ show_test = gr.Checkbox(label="Show Test Cases")
492
+ update_button = gr.Button("Update")
493
+
494
+ task_id_output = gr.Textbox(label="Task ID")
495
+ code_completion = gr.Code(language="python", label="Code Completion")
496
+ nl_instruction = gr.Code(language="markdown", label="Natural Language Instruction")
497
+ # solution = gr.Code(language="python", label="Solution")
498
+ test_cases = gr.Code(language="python", label="Test Cases")
499
+
500
+ update_button.click(
501
+ update_display,
502
+ inputs=[search_input, index_slider, show_test],
503
+ outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
504
+ )
505
+
506
+ # Initial load
507
+ demo.load(
508
+ update_display,
509
+ inputs=[search_input, index_slider, show_test],
510
+ outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
511
+ )
512
+
513
+ with gr.TabItem("🚀 Request", id=4):
514
+ gr.Markdown(SUBMISSION_TEXT_3)
515
+
516
+ with gr.TabItem(" Execute", id=5):
517
+ gr.Markdown("# BigCodeBench Evaluator")
518
+
519
+ with gr.Row():
520
+ jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
521
+ split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
522
+ subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
523
+
524
+ with gr.Row():
525
+ parallel = gr.Number(label="Parallel (optional)", precision=0)
526
+ min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
527
+ max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
528
+
529
+ with gr.Row():
530
+ max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
531
+ max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
532
+ check_gt_only = gr.Checkbox(label="Check GT Only")
533
+ no_gt = gr.Checkbox(label="No GT")
534
+
535
+ command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
536
+ with gr.Row():
537
+ submit_btn = gr.Button("Run Evaluation")
538
+ download_btn = gr.DownloadButton(label="Download Result")
539
+ log_output = gr.Textbox(label="Execution Logs", lines=20)
540
+
541
+ input_components = [
542
+ jsonl_file, split, subset, parallel,
543
+ min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
544
+ check_gt_only, no_gt
545
+ ]
546
+
547
+ for component in input_components:
548
+ component.change(generate_command, inputs=input_components, outputs=command_output)
549
+
550
+
551
+ def start_evaluation(command, jsonl_file, subset, split):
552
+ extra = subset + "_" if subset != "full" else ""
553
+ if jsonl_file is not None:
554
+ result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
555
+ else:
556
+ result_path = None
557
+
558
+ for log in stream_logs(command, jsonl_file):
559
+ if jsonl_file is not None:
560
+ yield log, gr.update(value=result_path, label=result_path), gr.update()
561
+ else:
562
+ yield log, gr.update(), gr.update()
563
+ is_running = False
564
+ result_file = find_result_file()
565
+ if result_file:
566
+ return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
567
+ # gr.Button(visible=False)#,
568
+ # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
569
+ else:
570
+ return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
571
+ # gr.Button("Run Evaluation", visible=True),
572
+ # gr.DownloadButton(visible=False))
573
+ submit_btn.click(start_evaluation,
574
+ inputs=[command_output, jsonl_file, subset, split],
575
+ outputs=[log_output, download_btn])
576
 
 
577
  with gr.Row():
578
+ with gr.Accordion("📙 Citation", open=False):
579
+ citation_button = gr.Textbox(
580
+ value=CITATION_BUTTON_TEXT,
581
+ label=CITATION_BUTTON_LABEL,
582
+ lines=20,
583
+ elem_id="citation-button",
584
+ show_copy_button=True,
585
+ )
586
+
587
+ main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
588
+ # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
589
+ # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
590
+ # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
591
+
592
+ main_block.queue(default_concurrency_limit=100)
593
+
594
+
595
+ def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
596
+ # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
597
+ # Compared to the original, this one does not monkeypatch Gradio, which allows us to define more webhooks.
598
+ # ht to Lucain!
599
+ if SPACE_ID is None:
600
+ print("Not in a Space: Space CI disabled.")
601
+ return WebhooksServer(ui=main_block)
602
+
603
+ if IS_EPHEMERAL_SPACE:
604
+ print("In an ephemeral Space: Space CI disabled.")
605
+ return WebhooksServer(ui=main_block)
606
+
607
+ card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
608
+ config = card.data.get("space_ci", {})
609
+ print(f"Enabling Space CI with config from README: {config}")
610
+
611
+ return configure_space_ci(
612
+ blocks=ui,
613
+ trusted_authors=config.get("trusted_authors"),
614
+ private=config.get("private", "auto"),
615
+ variables=config.get("variables", "auto"),
616
+ secrets=config.get("secrets"),
617
+ hardware=config.get("hardware"),
618
+ storage=config.get("storage"),
619
+ )
620
+
621
+ # Create webhooks server (with CI url if in Space and not ephemeral)
622
+ webhooks_server = enable_space_ci_and_return_server(ui=main_block)
623
+
624
+ # Add webhooks
625
+ @webhooks_server.add_webhook
626
+ def update_leaderboard(payload: WebhookPayload) -> None:
627
+ """Redownloads the leaderboard dataset each time it updates"""
628
+ if payload.repo.type == "dataset" and payload.event.action == "update":
629
+ global NEW_DATA_ON_LEADERBOARD
630
+ if NEW_DATA_ON_LEADERBOARD:
631
+ return
632
+ NEW_DATA_ON_LEADERBOARD = True
633
+
634
+ for repo in [RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO]:
635
+ datasets.load_dataset(
636
+ repo,
637
+ "default",
638
+ cache_dir=HF_HOME,
639
+ download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
640
+ verification_mode="no_checks"
641
+ )
642
 
643
+
644
+
645
+ webhooks_server.launch()
646
+
647
+ scheduler = BackgroundScheduler()
648
+ scheduler.add_job(restart_space, "interval", hours=5) # restarted every 5h as backup in case automatic updates are not working
649
+ scheduler.start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess
3
+ import sys
4
+ import os
5
+ import threading
6
+ import time
7
+ import uuid
8
+ import glob
9
+ import shutil
10
+ from pathlib import Path
11
+ from apscheduler.schedulers.background import BackgroundScheduler
12
+
13
+ default_command = "bigcodebench.evaluate"
14
+ is_running = False
15
+ lock = threading.Lock()
16
+
17
+ def generate_command(
18
+ jsonl_file, split, subset, parallel,
19
+ min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
20
+ check_gt_only, no_gt
21
+ ):
22
+ command = [default_command]
23
+
24
+ if jsonl_file is not None:
25
+ # Copy the uploaded file to the current directory
26
+ local_filename = os.path.basename(jsonl_file.name)
27
+ shutil.copy(jsonl_file.name, local_filename)
28
+ command.extend(["--samples", local_filename])
29
+
30
+ command.extend(["--split", split, "--subset", subset])
31
+
32
+ if parallel is not None and parallel != 0:
33
+ command.extend(["--parallel", str(int(parallel))])
34
+
35
+ command.extend([
36
+ "--min-time-limit", str(min_time_limit),
37
+ "--max-as-limit", str(int(max_as_limit)),
38
+ "--max-data-limit", str(int(max_data_limit)),
39
+ "--max-stack-limit", str(int(max_stack_limit))
40
+ ])
41
+
42
+ if check_gt_only:
43
+ command.append("--check-gt-only")
44
+
45
+ if no_gt:
46
+ command.append("--no-gt")
47
+
48
+ return " ".join(command)
49
+
50
+
51
+ def cleanup_previous_files(jsonl_file):
52
+ if jsonl_file is not None:
53
+ file_list = ['Dockerfile', 'app.py', 'README.md', os.path.basename(jsonl_file.name), "__pycache__"]
54
+ else:
55
+ file_list = ['Dockerfile', 'app.py', 'README.md', "__pycache__"]
56
+ for file in glob.glob("*"):
57
+ try:
58
+ if file not in file_list:
59
+ os.remove(file)
60
+ except Exception as e:
61
+ print(f"Error during cleanup of {file}: {e}")
62
+
63
+ def find_result_file():
64
+ json_files = glob.glob("*.json")
65
+ if json_files:
66
+ return max(json_files, key=os.path.getmtime)
67
+ return None
68
+
69
+ def run_bigcodebench(command):
70
+ global is_running
71
+ with lock:
72
+ if is_running:
73
+ yield "A command is already running. Please wait for it to finish.\n"
74
+ return
75
+ is_running = True
76
+
77
+ try:
78
+ yield f"Executing command: {command}\n"
79
+
80
+ process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
81
+
82
+ for line in process.stdout:
83
+ yield line
84
+
85
+ # process.wait()
86
+
87
+ if process.returncode != 0:
88
+ yield f"Error: Command exited with status {process.returncode}\n"
89
+
90
+ yield "Evaluation completed.\n"
91
+
92
+ result_file = find_result_file()
93
+ if result_file:
94
+ yield f"Result file found: {result_file}\n"
95
+ else:
96
+ yield "No result file found.\n"
97
+ finally:
98
+ with lock:
99
+ is_running = False
100
+
101
+ def stream_logs(command, jsonl_file=None):
102
+ global is_running
103
+
104
+ if is_running:
105
+ yield "A command is already running. Please wait for it to finish.\n"
106
+ return
107
+
108
+ cleanup_previous_files(jsonl_file)
109
+ yield "Cleaned up previous files.\n"
110
+
111
+ log_content = []
112
+ for log_line in run_bigcodebench(command):
113
+ log_content.append(log_line)
114
+ yield "".join(log_content)
115
+
116
+ with gr.Blocks() as demo:
117
+ gr.Markdown("# BigCodeBench Evaluator")
118
+
119
+ with gr.Row():
120
+ jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
121
+ split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
122
+ subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")
123
+
124
+ with gr.Row():
125
+ parallel = gr.Number(label="Parallel (optional)", precision=0)
126
+ min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
127
+ max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
128
+
129
+ with gr.Row():
130
+ max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
131
+ max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
132
+ check_gt_only = gr.Checkbox(label="Check GT Only")
133
+ no_gt = gr.Checkbox(label="No GT")
134
+
135
+ command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
136
+ with gr.Row():
137
+ submit_btn = gr.Button("Run Evaluation")
138
+ download_btn = gr.DownloadButton(label="Download Result")
139
+ log_output = gr.Textbox(label="Execution Logs", lines=20)
140
+
141
+ input_components = [
142
+ jsonl_file, split, subset, parallel,
143
+ min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
144
+ check_gt_only, no_gt
145
+ ]
146
+
147
+ for component in input_components:
148
+ component.change(generate_command, inputs=input_components, outputs=command_output)
149
+
150
+
151
+ def start_evaluation(command, jsonl_file, subset, split):
152
+ extra = subset + "_" if subset != "full" else ""
153
+ if jsonl_file is not None:
154
+ result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
155
+ else:
156
+ result_path = None
157
+
158
+ for log in stream_logs(command, jsonl_file):
159
+ if jsonl_file is not None:
160
+ yield log, gr.update(value=result_path, label=result_path), gr.update()
161
+ else:
162
+ yield log, gr.update(), gr.update()
163
+ is_running = False
164
+ result_file = find_result_file()
165
+ if result_file:
166
+ return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
167
+ # gr.Button(visible=False)#,
168
+ # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
169
+ else:
170
+ return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
171
+ # gr.Button("Run Evaluation", visible=True),
172
+ # gr.DownloadButton(visible=False))
173
+ submit_btn.click(start_evaluation,
174
+ inputs=[command_output, jsonl_file, subset, split],
175
+ outputs=[log_output, download_btn])
176
+
177
+ demo.queue(max_size=300).launch(share=True, server_name="0.0.0.0", server_port=7860)
178
+ scheduler = BackgroundScheduler()
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
src/display/about.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TITLE = """<div style="text-align: center;"><h1> 🌸<span style='color: #C867B5;'>BigCodeBench</span> Leaderboard</h1></div>\
2
+ <br>\
3
+ <p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">⭐ Big Code Models Leaderboard</a>, we compare performance of LLMs on <a href="https://huggingface.co/datasets/bigcode/bigcodebench">BigCodeBench</a> benchmark.</p>
4
+ <p>To get started, please check out <a href="https://github.com/bigcode-project/bigcodebench">our GitHub repository</a>.
5
+ <br>\
6
+ For more details, please check our <a href="https://huggingface.co/blog/terryyz/bigcodebench-hard">blog on the Hard Set</a>, <a href="https://huggingface.co/blog/leaderboard-bigcodebench">blog on the Full Set</a> and <a href="https://arxiv.org/abs/2406.15877">paper</a>.</p>
7
+ """
8
+
9
+ ABOUT_TEXT = """# Context
10
+ We believe that there are three main expectations of a good execution-based programming benchmark:
11
+ 1. The benchmark should be easy to use and efficient in evaluating the fundamental capabilities of LLMs. Repo-level and agent-centric benchmarks (e.g., SWE-bench) are not suitable for this purpose.
12
+ 2. The benchmark should be practical, covering various programming scenarios. Algo-specific benchmarks (e.g., HumanEval and MBPP) are unsuitable. Domain-specific benchmarks (e.g., DS-1000) are also unsuitable for this purpose.
13
+ 3. The benchmark should be challenging, where the tasks require LLMs' strong compositional reasoning capabilities and instruction-following capabilities. The benchmarks with simple tasks (e.g., ODEX) are unsuitable.
14
+
15
+ BigCodeBench is the first benchmark that meets all three expectations. It is an <u>*__easy-to-use__*</u> benchmark that evaluates LLMs with <u>*__practical__*</u> and <u>*__challenging__*</u> programming tasks, accompanied by an end-to-end evaluation framework [`bigcodebench`](https://github.com/bigcode-project/bigcodebench). We aim to assess how well LLMs can solve programming tasks in an open-ended setting, with the following two focuses:
16
+
17
+ - Diverse Function Calls: This design requires LLMs to utilize diverse function calls.
18
+ - Complex Instructions: This design requires LLMs to follow complex instructions.
19
+
20
+
21
+ ### Benchmarks & Prompts
22
+ The dataset has 2 variants:
23
+ 1. `BigCodeBench-Complete`: _Code Completion based on the structured long-context docstrings_.
24
+ 1. `BigCodeBench-Instruct`: _Code Generation based on the NL-oriented instructions_.
25
+
26
+ Figure below shows the example of `Complete` vs `Instruct` prompt. For `Instruct`, we only focus on instruction-tuned LLMs.
27
+
28
+ <img src="https://github.com/bigcode-bench/bigcode-bench.github.io/blob/main/asset/bigcodebench_prompt.svg?raw=true" alt="OctoCoder vs Base HumanEval prompt" width="800px">
29
+
30
+ The specific prompt template can be found [here](https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/model.py).
31
+
32
+ There are some edge cases:
33
+ - Due to the training flaws in StarCoder2 and Granite-Code, we additionally strip the trailing newlines for model inference.
34
+ - We have not included the `Instruct` results of Granite-Code-Instruct 8B & 3B as they constantly have empty outputs.
35
+
36
+ ### Evaluation Parameters
37
+ - All models were evaluated with the [bigcodebench](https://github.com/bigcode-project/bigcodebench). You can install the [PyPI package](https://pypi.org/project/bigcodebench/).
38
+ To get started, please first set up the environment:
39
+
40
+ ```bash
41
+ # Install to use bigcodebench.evaluate
42
+ pip install bigcodebench --upgrade
43
+ # If you want to use the evaluate locally, you need to install the requirements
44
+ pip install -I -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/main/Requirements/requirements-eval.txt
45
+
46
+ # Install to use bigcodebench.generate
47
+ # You are strongly recommended to install the generate dependencies in a separate environment
48
+ pip install bigcodebench[generate] --upgrade
49
+ ```
50
+
51
+ ### Scoring and Rankings
52
+ - Models are ranked according to Pass@1 using greedy decoding. Setup details can be found <a href="https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/generate.py">here</a>.
53
+ - The code to compute Elo rating is [here](https://github.com/bigcode-project/bigcodebench/blob/main/analysis/get_results.py), which is based on [Chatbot Arena Notebook](https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR#scrollTo=JdiJbB6pZB1B&line=2&uniqifier=1). We only compute the Elo rating for the `BigCodeBench-Complete` variant.
54
+
55
+ ### Contact
56
+ If you have any questions, feel free to reach out to us at [terry.zhuo@monash.edu](mailto:terry.zhuo@monash.edu) or [contact@bigcode-project.org](mailto:contact@bigcode-project.org)
57
+
58
+ ### Citation Information
59
+
60
+ ```bibtex
61
+ @article{zhuo2024bigcodebench,
62
+ title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
63
+ author={Terry Yue Zhuo and Minh Chien Vu and Jenny Chim and Han Hu and Wenhao Yu and Ratnadira Widyasari and Imam Nur Bani Yusuf and Haolan Zhan and Junda He and Indraneil Paul and Simon Brunner and Chen Gong and Thong Hoang and Armel Randy Zebaze and Xiaoheng Hong and Wen-Ding Li and Jean Kaddour and Ming Xu and Zhihan Zhang and Prateek Yadav and Naman Jain and Alex Gu and Zhoujun Cheng and Jiawei Liu and Qian Liu and Zijian Wang and David Lo and Binyuan Hui and Niklas Muennighoff and Daniel Fried and Xiaoning Du and Harm de Vries and Leandro Von Werra},
64
+ journal={arXiv preprint arXiv:2406.15877},
65
+ year={2024}
66
+ }
67
+ ```
68
+ """
69
+
70
+ SUBMISSION_TEXT = """
71
+ <h1 align="center">
72
+ How to submit models/results to the leaderboard?
73
+ </h1>
74
+ We welcome the community to submit evaluation results of new models. We also provide an experimental feature for submitting models that our team will evaluate on the 🤗 cluster.
75
+
76
+ ## Submitting Models (experimental feature)
77
+ Inspired from the Open LLM Leaderboard, we welcome code models submission from the community that will be automatically evaluated. Please note that this is still an experimental feature.
78
+ Below are some guidelines to follow before submitting your model:
79
+
80
+ #### 1) Make sure you can load your model and tokenizer using AutoClasses:
81
+ ```python
82
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
83
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
84
+ model = AutoModel.from_pretrained("your model name", revision=revision)
85
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
86
+ ```
87
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
88
+ Note: make sure your model is public!
89
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet.
90
+ #### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
91
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
92
+ #### 3) Make sure your model has an open license!
93
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
94
+ #### 4) Fill up your model card
95
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
96
+ """
97
+
98
+ SUBMISSION_TEXT_2 = """
99
+ ## Submitting Results
100
+ You also have the option for running evaluation yourself and submitting results. These results will be added as non-verified, the authors are however required to upload their generations in case other members want to check.
101
+
102
+ ### 1 - Running Evaluation
103
+
104
+ We wrote a detailed guide for running the evaluation on your model. You can find it in [bigcode-evaluation-harness/leaderboard](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main/leaderboard). This will generate a json file summarizing the results, in addition to the raw generations and metric files.
105
+
106
+ ### 2- Submitting Results 🚀
107
+
108
+ To submit your results create a **Pull Request** in the community tab to add them under the [folder](https://huggingface.co/spaces/bigcode/bigcodebench-code-evals/tree/main/community_results) `community_results` in this repository:
109
+ - Create a folder called `ORG_MODELNAME_USERNAME` for example `bigcode_my_model_terry`
110
+ - Put your json file with grouped scores from the guide, in addition generations folder and metrics folder in it.
111
+
112
+ The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replace org and model with those corresponding to the model you evaluated.
113
+ """
114
+
115
+ SUBMISSION_TEXT_3 = """
116
+ <h1 align="center">
117
+ How to submit models/results to the leaderboard?
118
+ </h1>
119
+ We welcome the community to submit evaluation results of new models. These results will be added as non-verified, the authors are however required to upload their generations in case other members want to check.
120
+
121
+ ### 1 - Running Evaluation
122
+
123
+ We wrote a detailed guide for running the evaluation on your model. You can find it in [bigcode-evaluation-harness/leaderboard](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main/leaderboard). This will generate a json file summarizing the results, in addition to the raw generations and metric files.
124
+
125
+ ### 2- Submitting Results 🚀
126
+
127
+ To submit your results create a **Pull Request** in the community tab to add them under the [folder](https://huggingface.co/spaces/bigcode/multilingual-code-evals/tree/main/community_results) `community_results` in this repository:
128
+ - Create a folder called `ORG_MODELNAME_USERNAME` for example `bigcode_starcoder_loubnabnl`
129
+ - Put your json file with grouped scores from the guide, in addition generations folder and metrics folder in it.
130
+
131
+ The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replace org and model with those corresponding to the model you evaluated.
132
+ """
133
+
134
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
135
+
136
+ CITATION_BUTTON_TEXT = r"""
137
+ @article{zhuo2024bigcodebench,
138
+ title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
139
+ author={Zhuo, Terry Yue and Vu, Minh Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and others},
140
+ journal={arXiv preprint arXiv:2406.15877},
141
+ year={2024}
142
+ }
143
+ """
144
+
145
+ SUBMISSION_TEXT_3="""
146
+ ## We welcome the community to request for new models to be added to the leaderboard.
147
+ ## Please [file an issue](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to add the model to the leaderboard or [start a discussion](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard/discussions/new) in the community🤗
148
+ """
src/display/css_html_js.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ custom_css = """
2
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
3
+ table td:first-child,
4
+ table th:first-child {
5
+ max-width: 400px;
6
+ overflow: auto;
7
+ white-space: nowrap;
8
+ }
9
+
10
+ /* Full width space */
11
+ .gradio-container {
12
+ max-width: 95% !important;
13
+ }
14
+
15
+ /* Text style and margins */
16
+ .markdown-text {
17
+ font-size: 16px !important;
18
+ }
19
+
20
+ #models-to-add-text {
21
+ font-size: 18px !important;
22
+ }
23
+
24
+ #citation-button span {
25
+ font-size: 16px !important;
26
+ }
27
+
28
+ #citation-button textarea {
29
+ font-size: 16px !important;
30
+ }
31
+
32
+ #citation-button > label > button {
33
+ margin: 6px;
34
+ transform: scale(1.3);
35
+ }
36
+
37
+ #search-bar-table-box > div:first-child {
38
+ background: none;
39
+ border: none;
40
+ }
41
+
42
+ #search-bar {
43
+ padding: 0px;
44
+ }
45
+
46
+ .tab-buttons button {
47
+ font-size: 20px;
48
+ }
49
+
50
+ /* Filters style */
51
+ #filter_type {
52
+ border: 0;
53
+ padding-left: 0;
54
+ padding-top: 0;
55
+ }
56
+ #filter_type label {
57
+ display: flex;
58
+ }
59
+ #filter_type label > span {
60
+ margin-top: var(--spacing-lg);
61
+ margin-right: 0.5em;
62
+ }
63
+ #filter_type label > .wrap {
64
+ width: 103px;
65
+ }
66
+ #filter_type label > .wrap .wrap-inner {
67
+ padding: 2px;
68
+ }
69
+ #filter_type label > .wrap .wrap-inner input {
70
+ width: 1px;
71
+ }
72
+ #filter-columns-type {
73
+ border: 0;
74
+ padding: 0.5;
75
+ }
76
+ #filter-columns-size {
77
+ border: 0;
78
+ padding: 0.5;
79
+ }
80
+ #box-filter > .form {
81
+ border: 0;
82
+ }
83
+
84
+ /* Header styles */
85
+ #header-title {
86
+ text-align: left;
87
+ display: inline-block;
88
+ }
89
+
90
+ #header-row {
91
+ display: flex;
92
+ justify-content: space-between;
93
+ align-items: center;
94
+ }
95
+
96
+ #header-row .gradio-html {
97
+ flex-grow: 1;
98
+ }
99
+
100
+ #oauth-button {
101
+ height: auto;
102
+ min-width: max-content;
103
+ white-space: nowrap;
104
+ padding: 10px 20px;
105
+ border-radius: 4px;
106
+ }
107
+ """
108
+
109
+ get_window_url_params = """
110
+ function(url_params) {
111
+ const params = new URLSearchParams(window.location.search);
112
+ url_params = Object.fromEntries(params);
113
+ return url_params;
114
+ }
115
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import HfApi
2
+
3
+ API = HfApi()
4
+
5
+
6
+ def model_hyperlink(link, model_name):
7
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
8
+
9
+
10
+ def make_clickable_model(df, model_col, link_col):
11
+ df[model_col] = df.apply(
12
+ lambda row: model_hyperlink(row[link_col], row[model_col]), axis=1
13
+ )
14
+ df["Openness"] = df.apply(
15
+ lambda row: "Open" if "huggingface.co" in row[link_col] else "Closed", axis=1
16
+ )
17
+ return df
18
+
19
+
20
+ def styled_error(error):
21
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
22
+
23
+
24
+ def styled_warning(warn):
25
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
26
+
27
+
28
+ def styled_message(message):
29
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
30
+
31
+
32
+ def has_no_nan_values(df, columns):
33
+ return df[columns].notna().all(axis=1)
34
+
35
+
36
+ def has_nan_values(df, columns):
37
+ return df[columns].isna().any(axis=1)
src/display/utils.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+ import json
4
+ import logging
5
+ from datetime import datetime
6
+ import pandas as pd
7
+
8
+
9
+ # Configure logging
10
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
11
+
12
+ # Convert ISO 8601 dates to datetime objects for comparison
13
+ def parse_iso8601_datetime(date_str):
14
+ if date_str.endswith('Z'):
15
+ date_str = date_str[:-1] + '+00:00'
16
+ return datetime.fromisoformat(date_str)
17
+
18
+ def parse_datetime(datetime_str):
19
+ formats = [
20
+ "%Y-%m-%dT%H-%M-%S.%f", # Format with dashes
21
+ "%Y-%m-%dT%H:%M:%S.%f", # Standard format with colons
22
+ "%Y-%m-%dT%H %M %S.%f", # Spaces as separator
23
+ ]
24
+
25
+ for fmt in formats:
26
+ try:
27
+ return datetime.strptime(datetime_str, fmt)
28
+ except ValueError:
29
+ continue
30
+ # in rare cases set unix start time for files with incorrect time (legacy files)
31
+ logging.error(f"No valid date format found for: {datetime_str}")
32
+ return datetime(1970, 1, 1)
33
+
34
+
35
+ def load_json_data(file_path):
36
+ """Safely load JSON data from a file."""
37
+ try:
38
+ with open(file_path, "r") as file:
39
+ return json.load(file)
40
+ except json.JSONDecodeError:
41
+ print(f"Error reading JSON from {file_path}")
42
+ return None # Or raise an exception
43
+
44
+
45
+ def fields(raw_class):
46
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
47
+
48
+
49
+ column_map = {
50
+ "T": "T",
51
+ "model": "Model",
52
+ "type": "Model Type",
53
+ "size_range": "Size Range",
54
+ "complete": "Complete",
55
+ "instruct": "Instruct",
56
+ "average": "Average",
57
+ "elo_mle": "Elo Rating",
58
+ "link": "Link",
59
+ "act_param": "#Act Params (B)",
60
+ "size": "#Params (B)",
61
+ "moe": "MoE",
62
+ # "lazy": "Lazy",
63
+ "openness": "Openness",
64
+ # "direct_complete": "Direct Completion",
65
+ }
66
+
67
+ type_map = {
68
+ "🔶": "🔶 Chat Models (RLHF, DPO, IFT, ...)",
69
+ "🟢": "🟢 Base Models"
70
+ }
71
+
72
+ moe_map = {
73
+ True: "MoE",
74
+ False: "Dense"
75
+ }
76
+ # These classes are for user facing column names,
77
+ # to avoid having to change them all around the code
78
+ # when a modification is needed
79
+ @dataclass(frozen=True)
80
+ class ColumnContent:
81
+ name: str
82
+ type: str
83
+ displayed_by_default: bool
84
+ hidden: bool = False
85
+ never_hidden: bool = False
86
+ dummy: bool = False
87
+
88
+
89
+ auto_eval_column_dict = []
90
+ # Init
91
+ auto_eval_column_dict.append(["T", ColumnContent, ColumnContent(column_map["T"], "str", True, never_hidden=True)])
92
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent(column_map["model"], "markdown", True, never_hidden=True)])
93
+ auto_eval_column_dict.append(["type", ColumnContent, ColumnContent(column_map["type"], "str", False, True)])
94
+ auto_eval_column_dict.append(["size_range", ColumnContent, ColumnContent(column_map["size_range"], "str", False, True)])
95
+ # Scores
96
+ auto_eval_column_dict.append(["complete", ColumnContent, ColumnContent(column_map["complete"], "number", True)])
97
+ auto_eval_column_dict.append(["instruct", ColumnContent, ColumnContent(column_map["instruct"], "number", True)])
98
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(column_map["average"], "number", True)])
99
+ auto_eval_column_dict.append(["elo_mle", ColumnContent, ColumnContent(column_map["elo_mle"], "number", True)])
100
+
101
+ # Model information
102
+ auto_eval_column_dict.append(["act_param", ColumnContent, ColumnContent(column_map["act_param"], "number", True)])
103
+ auto_eval_column_dict.append(["link", ColumnContent, ColumnContent(column_map["link"], "str", False, True)])
104
+ auto_eval_column_dict.append(["size", ColumnContent, ColumnContent(column_map["size"], "number", False)])
105
+ # auto_eval_column_dict.append(["lazy", ColumnContent, ColumnContent(column_map["lazy"], "bool", False, True)])
106
+ auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent(column_map["moe"], "str", False, True)])
107
+ auto_eval_column_dict.append(["openness", ColumnContent, ColumnContent(column_map["openness"], "str", False, True)])
108
+ # auto_eval_column_dict.append(["direct_complete", ColumnContent, ColumnContent(column_map["direct_complete"], "bool", False)])
109
+
110
+ # We use make dataclass to dynamically fill the scores from Tasks
111
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
112
+
113
+
114
+ @dataclass(frozen=True)
115
+ class EvalQueueColumn: # Queue column
116
+ model_link = ColumnContent("link", "markdown", True)
117
+ model_name = ColumnContent("model", "str", True)
118
+
119
+ @dataclass
120
+ class ModelDetails:
121
+ name: str
122
+ symbol: str = "" # emoji, only for the model type
123
+
124
+
125
+ # Column selection
126
+ COLS = [c.name for c in fields(AutoEvalColumn)]
127
+ TYPES = [c.type for c in fields(AutoEvalColumn)]
128
+
129
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
130
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
131
+
132
+
133
+ NUMERIC_INTERVALS = {
134
+ "?": pd.Interval(-1, 0, closed="right"),
135
+ "~1.5": pd.Interval(0, 2, closed="right"),
136
+ "~3": pd.Interval(2, 4, closed="right"),
137
+ "~7": pd.Interval(4, 9, closed="right"),
138
+ "~13": pd.Interval(9, 20, closed="right"),
139
+ "~35": pd.Interval(20, 45, closed="right"),
140
+ "~60": pd.Interval(45, 70, closed="right"),
141
+ "70+": pd.Interval(70, 10000, closed="right"),
142
+ }
src/envs.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import HfApi
3
+
4
+ # clone / pull the lmeh eval data
5
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
+
7
+ DATA_VERSION = "v0.1.0_hf"
8
+
9
+ REPO_ID = "bigcode/bigcodebench-leaderboard"
10
+ QUEUE_REPO = "bigcode/bigcodebench-requests"
11
+ DATA_REPO = "bigcode/bigcodebench"
12
+ RESULT_REPO = "bigcode/bigcodebench-results"
13
+ HARD_RESULT_REPO = "bigcode/bigcodebench-hard-results"
14
+
15
+ ELO_REPO = "bigcode/bigcodebench-elo"
16
+ HARD_ELO_REPO = "bigcode/bigcodebench-hard-elo"
17
+ SOLVE_REPO = "bigcode/bigcodebench-solve-rate"
18
+ HARD_SOLVE_REPO = "bigcode/bigcodebench-hard-solve-rate"
19
+
20
+ VOTES_REPO = "bigcode/bigcodebench-votes"
21
+
22
+ HF_HOME = os.getenv("HF_HOME", ".")
23
+
24
+ # Check HF_HOME write access
25
+ print(f"Initial HF_HOME set to: {HF_HOME}")
26
+
27
+ if not os.access(HF_HOME, os.W_OK):
28
+ print(f"No write access to HF_HOME: {HF_HOME}. Resetting to current directory.")
29
+ HF_HOME = "."
30
+ os.environ["HF_HOME"] = HF_HOME
31
+ else:
32
+ print("Write access confirmed for HF_HOME")
33
+
34
+ VOTES_PATH = os.path.join(HF_HOME, "model-votes")
35
+ EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
36
+
37
+ # Rate limit variables
38
+ RATE_LIMIT_PERIOD = 7
39
+ RATE_LIMIT_QUOTA = 5
40
+ HAS_HIGHER_RATE_LIMIT = []
41
+
42
+ API = HfApi(token=HF_TOKEN)
src/execute.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess
3
+ import sys
4
+ import os
5
+ import threading
6
+ import time
7
+ import uuid
8
+ import glob
9
+ import shutil
10
+ from pathlib import Path
11
+ from apscheduler.schedulers.background import BackgroundScheduler
12
+
13
+ default_command = "bigcodebench.evaluate"
14
+ is_running = False
15
+ lock = threading.Lock()
16
+
17
def generate_command(
    jsonl_file, split, subset, parallel,
    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
    check_gt_only, no_gt
):
    """Build the evaluation command line from the current form values.

    If a samples file was uploaded, it is copied into the current working
    directory and the command refers to it by base name. The ``--parallel``
    flag is only emitted for a non-empty, non-zero value.

    Returns:
        The command as a single string with arguments separated by spaces.
    """
    args = [default_command]

    if jsonl_file is not None:
        # Side effect: the evaluator is pointed at a copy in the working directory.
        samples_name = os.path.basename(jsonl_file.name)
        shutil.copy(jsonl_file.name, samples_name)
        args += ["--samples", samples_name]

    args += ["--split", split, "--subset", subset]

    if parallel is not None and parallel != 0:
        args += ["--parallel", str(int(parallel))]

    # The time limit is passed through as given; the three size limits are truncated to int.
    args += ["--min-time-limit", str(min_time_limit)]
    for flag, limit in (
        ("--max-as-limit", max_as_limit),
        ("--max-data-limit", max_data_limit),
        ("--max-stack-limit", max_stack_limit),
    ):
        args += [flag, str(int(limit))]

    if check_gt_only:
        args.append("--check-gt-only")
    if no_gt:
        args.append("--no-gt")

    return " ".join(args)
49
+
50
+
51
def cleanup_previous_files(jsonl_file):
    """Remove every entry in the working directory that is not on the keep list.

    The keep list is the fixed set of project files, ``__pycache__`` and, when
    given, the base name of the uploaded samples file. Removal uses
    ``os.remove``; any failure is printed and the sweep continues.
    """
    keep = ['Dockerfile', 'app.py', 'README.md']
    if jsonl_file is not None:
        keep.append(os.path.basename(jsonl_file.name))
    keep.append("__pycache__")

    for entry in glob.glob("*"):
        if entry in keep:
            continue
        try:
            os.remove(entry)
        except Exception as e:
            print(f"Error during cleanup of {entry}: {e}")
62
+
63
def find_result_file():
    """Return the most recently modified ``*.json`` file in the working directory.

    Returns:
        The file name, or None when the directory contains no ``.json`` file.
    """
    candidates = glob.glob("*.json")
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)
68
+
69
def run_bigcodebench(command):
    """Run *command* as a subprocess and stream its combined output.

    This is a generator. It yields a notice and stops if another run is in
    progress; otherwise it yields a header line, each line the process writes
    to stdout/stderr, an error line when the exit status is non-zero, a
    completion line, and finally whether a result file was found.

    Args:
        command: Command line as one string; it is split on whitespace.

    Yields:
        Text chunks, each terminated by a newline.
    """
    global is_running

    # Decide under the lock, but yield outside it: a generator suspended
    # inside `with lock` would keep the lock held until it is resumed.
    with lock:
        already_running = is_running
        if not already_running:
            is_running = True
    if already_running:
        yield "A command is already running. Please wait for it to finish.\n"
        return

    try:
        yield f"Executing command: {command}\n"

        process = subprocess.Popen(
            command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )

        for line in process.stdout:
            yield line

        # returncode stays None until the child is reaped, so wait() is
        # required before the exit status can be checked.
        return_code = process.wait()

        if return_code != 0:
            yield f"Error: Command exited with status {return_code}\n"

        yield "Evaluation completed.\n"

        result_file = find_result_file()
        if result_file:
            yield f"Result file found: {result_file}\n"
        else:
            yield "No result file found.\n"
    finally:
        with lock:
            is_running = False
100
+
101
def stream_logs(command, jsonl_file=None):
    """Clean the working directory, run *command*, and yield the growing log.

    This is a generator. If a run is already in progress it yields a notice
    and stops. Otherwise it yields a clean-up notice, then, after each chunk
    produced by run_bigcodebench, the full transcript accumulated so far.
    """
    if is_running:
        yield "A command is already running. Please wait for it to finish.\n"
        return

    cleanup_previous_files(jsonl_file)
    yield "Cleaned up previous files.\n"

    transcript = ""
    for chunk in run_bigcodebench(command):
        transcript += chunk
        yield transcript
115
+
116
# Gradio front end: a form whose values are rendered into a command line,
# plus a button that runs that command and streams its log.
with gr.Blocks() as demo:
    gr.Markdown("# BigCodeBench Evaluator")

    with gr.Row():
        jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
        split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
        subset = gr.Dropdown(choices=["hard"], label="Subset", value="hard")

    with gr.Row():
        parallel = gr.Number(label="Parallel (optional)", precision=0)
        min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
        max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)

    with gr.Row():
        max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
        max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
        check_gt_only = gr.Checkbox(label="Check GT Only")
        no_gt = gr.Checkbox(label="No GT")

    command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
    with gr.Row():
        submit_btn = gr.Button("Run Evaluation")
        download_btn = gr.DownloadButton(label="Download Result")
    log_output = gr.Textbox(label="Execution Logs", lines=20)

    input_components = [
        jsonl_file, split, subset, parallel,
        min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
        check_gt_only, no_gt
    ]

    # Any change to a form field regenerates the displayed command.
    for component in input_components:
        component.change(generate_command, inputs=input_components, outputs=command_output)

    def start_evaluation(command, jsonl_file, subset, split):
        """Run the evaluation and stream updates to the log box and download button.

        This is a generator used as the click handler. Every yield is a pair
        ``(log_output update, download_btn update)`` so that it matches the
        two output components wired below. The last yield reports whether a
        result file was found.
        """
        extra = subset + "_" if subset != "full" else ""
        if jsonl_file is not None:
            # Name derived from the uploaded file; used as the download target while running.
            result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
        else:
            result_path = None

        for log in stream_logs(command, jsonl_file):
            if jsonl_file is not None:
                yield log, gr.update(value=result_path, label=result_path)
            else:
                yield log, gr.update()

        # The final state must be yielded: a value passed to `return` inside a
        # generator is not delivered to the consumer as an item.
        result_file = find_result_file()
        if result_file:
            yield gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
        else:
            yield gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)

    submit_btn.click(start_evaluation,
                     inputs=[command_output, jsonl_file, subset, split],
                     outputs=[log_output, download_btn])

demo.queue(max_size=300).launch(share=True, server_name="0.0.0.0", server_port=7860)
# NOTE(review): the scheduler is created here but no job is added and it is
# not started in the visible code — confirm whether it is still needed.
scheduler = BackgroundScheduler()
src/populate.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ import pandas as pd
3
+ from datasets import Dataset
4
+ from src.display.formatting import has_no_nan_values, make_clickable_model
5
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
6
+ from src.display.utils import load_json_data, column_map, type_map, moe_map, NUMERIC_INTERVALS
7
+
8
+
9
+
10
def get_evaluation_queue_df(save_path, cols):
    """Generate dataframes for pending, running, and finished evaluation entries.

    Every ``*.json`` file under *save_path* (searched recursively) is loaded
    and sorted into a bucket by its ``"status"`` field. Entries with a missing
    or unrecognised status are ignored.

    Args:
        save_path: Root directory of the request files.
        cols: Column names for the returned dataframes.

    Returns:
        A tuple ``(finished, running, pending)`` of dataframes.
    """
    save_path = pathlib.Path(save_path)

    # Request status -> bucket it is reported under.
    bucket_of = {
        "PENDING": "PENDING",
        "RERUN": "PENDING",
        "RUNNING": "RUNNING",
        "FINISHED": "FINISHED",
        "PENDING_NEW_EVAL": "FINISHED",
    }
    buckets = {"FINISHED": [], "RUNNING": [], "PENDING": []}

    for path in save_path.rglob("*.json"):
        data = load_json_data(path)
        # NOTE(review): assumes load_json_data returns a dict per request file
        # (anything else is skipped) — confirm against its implementation.
        if not isinstance(data, dict):
            continue
        bucket = bucket_of.get(data.get("status"))
        if bucket is not None:
            buckets[bucket].append(data)

    return tuple(pd.DataFrame(buckets[name], columns=cols) for name in ("FINISHED", "RUNNING", "PENDING"))
30
+
31
+
32
def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list):
    """Build the leaderboard dataframe from the dataset.

    Columns are renamed through ``column_map``, the MoE and type columns are
    mapped to their display values, an average of the complete and instruct
    scores and a size bucket are added, model names are made clickable, and
    the rows are sorted by average (descending). Only *cols* are returned,
    rounded to two decimals.
    """
    columnar = leaderboard_dataset.to_dict()
    row_count = leaderboard_dataset.num_rows
    records = [{key: columnar[key][row] for key in columnar} for row in range(row_count)]

    moe_col = AutoEvalColumn.moe.name
    type_col = AutoEvalColumn.type.name
    complete_col = AutoEvalColumn.complete.name
    instruct_col = AutoEvalColumn.instruct.name
    average_col = AutoEvalColumn.average.name

    def _average(row):
        # The average is only defined when both scores are present.
        complete = row[complete_col]
        if pd.isna(complete):
            return None
        instruct = row[instruct_col]
        if pd.isna(instruct):
            return None
        return round((complete + instruct) / 2, 1)

    def _size_bucket(size):
        # First interval containing the size wins; "?" when none matches.
        for label, interval in NUMERIC_INTERVALS.items():
            if size in interval:
                return label
        return "?"

    df = pd.DataFrame.from_records(records)
    df = df.rename(columns=column_map)
    df[moe_col] = df[moe_col].map(moe_map)
    # Keep the raw type in the T column before the type column is mapped.
    df[AutoEvalColumn.T.name] = df[type_col]
    df[type_col] = df[type_col].map(type_map)
    df[average_col] = df.apply(_average, axis=1)
    df[AutoEvalColumn.size_range.name] = df[AutoEvalColumn.size.name].apply(_size_bucket)
    df = make_clickable_model(df, AutoEvalColumn.model.name, AutoEvalColumn.link.name)
    df = df.sort_values(by=[average_col], ascending=False)
    return df[cols].round(decimals=2)
src/tools/plots.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import plotly.graph_objects as go
2
+ import plotly.express as px
3
+ import numpy as np
4
+
5
+
6
def plot_elo_mle(df):
    """Return a scatter figure of rating per model with asymmetric error bars.

    Reads the ``model``, ``rating``, ``error_y`` and ``error_y_minus`` columns
    of *df*.
    """
    scatter_args = {
        "x": "model",
        "y": "rating",
        "error_y": "error_y",
        "error_y_minus": "error_y_minus",
    }
    fig = px.scatter(df, **scatter_args)

    layout_args = {"xaxis_title": "Model", "yaxis_title": "Rating", "autosize": True}
    fig.update_layout(**layout_args)
    return fig
18
+
19
+
20
def plot_solve_rate(df, task, rows=30, cols=38):
    """Return a heatmap of per-task solve rates laid out on a rows x cols grid.

    Tasks are ordered by the integer after the last ``/`` in ``task_id`` and
    placed row by row; unused cells are masked. The title reports the share of
    tasks with a non-zero solve rate.

    Args:
        df: Frame with ``task_id`` and ``solve_rate`` columns.
        task: Label appended to the title.
        rows: Minimum number of grid rows; more rows are used when the tasks
            do not fit in ``rows * cols`` cells.
        cols: Number of grid columns.

    Returns:
        A plotly ``Figure``.
    """
    keys = df["task_id"]
    values = np.array(df["solve_rate"], dtype=float)  # Ensure values are floats

    # Extract numerical IDs and sort by them
    ids = [int(key.split('/')[-1]) for key in keys]
    sorted_indices = np.argsort(ids)
    keys = np.array(keys)[sorted_indices]
    values = values[sorted_indices]

    n = len(values)
    if n > rows * cols:
        # Grow the grid (ceiling division) instead of failing on a negative pad width.
        rows = -(-n // cols)
    pad_width = rows * cols - n

    # Masked array: only the first n cells carry data.
    masked_values = np.ma.array(np.full(rows * cols, np.nan), mask=True)
    masked_values[:n] = values
    masked_values.mask[:n] = False
    masked_values = masked_values.reshape((rows, cols))

    keys_padded = np.pad(keys, (0, pad_width), 'constant', constant_values='')
    keys_reshaped = keys_padded.reshape((rows, cols))

    hover_text = np.empty_like(masked_values, dtype=object)
    for i in range(rows):
        for j in range(cols):
            if not masked_values.mask[i, j]:
                hover_text[i, j] = f"{keys_reshaped[i, j]}<br>Solve Rate: {masked_values[i, j]:.2f}"
            else:
                hover_text[i, j] = "NaN"

    # Share of tasks with a non-zero solve rate; 0.0 when there are no tasks.
    upper_solve_rate = round(np.count_nonzero(values) / n * 100, 2) if n else 0.0

    fig = go.Figure(data=go.Heatmap(
        z=masked_values,
        text=hover_text,
        hoverinfo='text',
        colorscale='teal',
        zmin=0,
        zmax=100
    ))

    fig.update_layout(
        title=f'BigCodeBench-{task}<br><i>Lowest Upper Limit: {upper_solve_rate}%</i>',
        xaxis_nticks=cols,
        yaxis_nticks=rows,
        xaxis=dict(showticklabels=False),
        yaxis=dict(showticklabels=False),
        autosize=True,
    )

    return fig
src/voting/vote_system.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import pathlib
4
+ import pandas as pd
5
+ import gradio as gr
6
+ import schedule
7
+ import time
8
+ from datetime import datetime, timezone
9
+
10
+ from src.envs import API
11
+
12
# Set up logging
# Root logging is configured at INFO level; this module logs through its own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
15
+
16
class VoteManager:
    """Record user votes for models in a local JSONL file and upload them to a Hub dataset.

    Votes live in ``<votes_path>/votes_data.jsonl``, one JSON object per line
    with the keys ``model``, ``revision``, ``username`` and ``timestamp``.
    A set of ``(model, revision, username)`` tuples is kept in memory to
    reject duplicate votes.
    """

    def __init__(self, votes_path, eval_requests_path, repo_id):
        """Store the paths and repo id, then load the existing votes from disk."""
        self.votes_path = votes_path
        self.eval_requests_path = eval_requests_path
        self.repo_id = repo_id
        self.vote_dataset = self.read_vote_dataset()
        self.vote_check_set = self.make_check_set(self.vote_dataset)
        # Votes recorded locally since the last successful upload.
        self.votes_to_upload = []

    def init_vote_dataset(self):
        """Reload the votes from disk and rebuild the duplicate-check set."""
        self.vote_dataset = self.read_vote_dataset()
        self.vote_check_set = self.make_check_set(self.vote_dataset)

    def read_vote_dataset(self):
        """Return the votes file as a DataFrame (empty when the file is missing).

        Blank lines are skipped so that a stray empty line does not abort the load.
        """
        records = []
        votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
        if votes_file.exists():
            with open(votes_file, "r") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        records.append(json.loads(line))
        return pd.DataFrame(records)

    def make_check_set(self, vote_dataset: pd.DataFrame):
        """Return the set of ``(model, revision, username)`` tuples in *vote_dataset*."""
        return {
            (row.model, row.revision, row.username)
            for row in vote_dataset.itertuples(index=False, name='vote')
        }

    def get_model_revision(self, selected_model: str) -> str:
        """Fetch the revision for the given model from the request files.

        Looks at ``*.json`` files in the immediate sub-directories of the
        requests path. Returns ``"main"`` when the requests directory does not
        exist, when no file matches, or when the matching file has no revision.
        """
        requests_root = pathlib.Path(self.eval_requests_path)
        if not requests_root.is_dir():
            return "main"
        for user_folder in requests_root.iterdir():
            if user_folder.is_dir():
                for file in user_folder.glob("*.json"):
                    with open(file, "r") as f:
                        data = json.load(f)
                        if data.get("model") == selected_model:
                            return data.get("revision", "main")
        return "main"

    def create_request_vote_df(self, pending_models_df: gr.Dataframe):
        """Return the pending models annotated with a ``vote_count`` column.

        The frame is returned unchanged when it is empty or has no
        ``model_name`` column. Otherwise the votes are re-read from disk,
        counted per ``(model, revision)``, left-joined onto the pending
        models, and the result is sorted by vote count (descending) and then
        model name. The ``model_name`` and ``model`` columns are dropped.
        """
        if pending_models_df.empty or "model_name" not in pending_models_df.columns:
            return pending_models_df
        self.vote_dataset = self.read_vote_dataset()
        if {"model", "revision"}.issubset(self.vote_dataset.columns):
            vote_counts = self.vote_dataset.groupby(['model', 'revision']).size().reset_index(name='vote_count')
        else:
            # No votes recorded yet: the frame has no columns to group by, so
            # use an empty count table and let every model get zero votes.
            vote_counts = pd.DataFrame({
                "model": pd.Series(dtype=object),
                "revision": pd.Series(dtype=object),
                "vote_count": pd.Series(dtype="int64"),
            })

        pending_models_df_votes = pd.merge(
            pending_models_df,
            vote_counts,
            left_on=["model_name", 'revision'],
            right_on=['model', 'revision'],
            how='left'
        )
        # Filling empty votes
        pending_models_df_votes['vote_count'] = pending_models_df_votes['vote_count'].fillna(0)
        pending_models_df_votes = pending_models_df_votes.sort_values(by=["vote_count", "model_name"], ascending=[False, True])
        # Removing useless columns
        pending_models_df_votes = pending_models_df_votes.drop(["model_name", "model"], axis=1)
        return pending_models_df_votes

    # Function to be called when a user votes for a model
    def add_vote(
        self,
        selected_model: str,
        pending_models_df: gr.Dataframe,
        profile: gr.OAuthProfile | None
    ):
        """Record a vote by the logged-in user for *selected_model*.

        Shows a warning and returns None when no model is selected, the user
        is not logged in, the user already voted for this model revision, or
        the vote could not be written. On success the vote is appended to the
        local file, queued for upload, and the refreshed pending-models frame
        is returned.
        """
        # model_name, revision, user_id, timestamp
        if selected_model in ["str", ""]:
            gr.Warning("No model selected")
            return

        if profile is None:
            gr.Warning("Hub Login required")
            return

        vote_username = profile.username
        model_revision = self.get_model_revision(selected_model)

        # Hashable key used to detect a repeated vote for the same model revision.
        check_tuple = (selected_model, model_revision, vote_username)
        if check_tuple in self.vote_check_set:
            gr.Warning("Already voted for this model")
            return

        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        vote_obj = {
            "model": selected_model,
            "revision": model_revision,
            "username": vote_username,
            "timestamp": current_time
        }

        # Append the vote to the JSONL file
        try:
            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
            with open(votes_file, "a") as f:
                f.write(json.dumps(vote_obj) + "\n")
            logger.info(f"Vote added locally: {vote_obj}")

            self.votes_to_upload.append(vote_obj)
        except Exception as e:
            logger.error(f"Failed to write vote to file: {e}")
            gr.Warning("Failed to record vote. Please try again")
            return

        self.vote_check_set.add(check_tuple)
        gr.Info(f"Voted for {selected_model}")

        return self.create_request_vote_df(pending_models_df)

    def upload_votes(self):
        """Upload the whole votes file to the dataset repo when new votes are queued.

        The queue is cleared only after a successful upload; a failure is
        logged and the queued votes are kept for the next attempt.
        """
        if self.votes_to_upload:
            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
            try:
                with open(votes_file, "rb") as f:
                    API.upload_file(
                        path_or_fileobj=f,
                        path_in_repo="votes_data.jsonl",
                        repo_id=self.repo_id,
                        repo_type="dataset",
                        commit_message="Updating votes_data.jsonl with new votes",
                    )
                logger.info("Votes uploaded to votes repository")
                self.votes_to_upload.clear()
            except Exception as e:
                logger.error(f"Failed to upload votes to repository: {e}")
146
+
147
def run_scheduler(vote_manager):
    """Run due `schedule` jobs in an endless loop, checking once per second.

    This function never returns. The `vote_manager` argument is not used in
    the body; presumably the jobs that reference it are registered by the
    caller — TODO confirm.
    """
    while True:
        schedule.run_pending()
        time.sleep(1)