Update app.py
Browse files
app.py
CHANGED
|
@@ -106,10 +106,12 @@ def evaluate(
|
|
| 106 |
max_as_limit: int = 30 * 1024,
|
| 107 |
max_data_limit: int = 30 * 1024,
|
| 108 |
max_stack_limit: int = 10,
|
|
|
|
| 109 |
check_gt_only: bool = False,
|
| 110 |
no_gt: bool = False,
|
|
|
|
| 111 |
):
|
| 112 |
-
|
| 113 |
if parallel < 1:
|
| 114 |
n_workers = max(1, multiprocessing.cpu_count() // 2)
|
| 115 |
else:
|
|
@@ -121,6 +123,14 @@ def evaluate(
|
|
| 121 |
extra = subset + "_" if subset != "full" else ""
|
| 122 |
|
| 123 |
problems = get_bigcodebench(subset=subset)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
dataset_hash = get_bigcodebench_hash(subset=subset)
|
| 125 |
|
| 126 |
if not no_gt:
|
|
@@ -156,7 +166,7 @@ def evaluate(
|
|
| 156 |
if "solution" in sample
|
| 157 |
else problems[task_id]["complete_prompt"] + sample["completion"]
|
| 158 |
)
|
| 159 |
-
if
|
| 160 |
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
|
| 161 |
remainings.add(sample["_identifier"])
|
| 162 |
args = (
|
|
@@ -213,7 +223,7 @@ def evaluate(
|
|
| 213 |
|
| 214 |
pass_at_k.update({
|
| 215 |
f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
|
| 216 |
-
for k in
|
| 217 |
if total.min() >= k
|
| 218 |
})
|
| 219 |
|
|
@@ -223,7 +233,7 @@ def evaluate(
|
|
| 223 |
pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
|
| 224 |
pass_at_k["split"] = split
|
| 225 |
pass_at_k["subset"] = subset
|
| 226 |
-
pass_at_k["calibrated"] =
|
| 227 |
pass_at_k["gt_pass_rate"] = gt_pass_rate
|
| 228 |
pass_at_k["failed_tasks"] = failed_tasks
|
| 229 |
|
|
@@ -243,8 +253,10 @@ interface = gr.Interface(
|
|
| 243 |
gr.Slider(1, 100 * 1024, step=1024, label="Max AS Limit", value=30 * 1024),
|
| 244 |
gr.Slider(1, 100 * 1024, step=1024, label="Max Data Limit", value=30 * 1024),
|
| 245 |
gr.Slider(1, 100, step=1, label="Max Stack Limit", value=10),
|
|
|
|
| 246 |
gr.Checkbox(label="Check GT Only"),
|
| 247 |
gr.Checkbox(label="No GT"),
|
|
|
|
| 248 |
],
|
| 249 |
outputs=[
|
| 250 |
gr.JSON(label="Results"),
|
|
@@ -271,8 +283,14 @@ def restart_space():
|
|
| 271 |
|
| 272 |
|
| 273 |
# if __name__ == "__main__":
|
| 274 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
scheduler = BackgroundScheduler()
|
| 276 |
-
scheduler.add_job(restart_space, "interval", hours=
|
| 277 |
scheduler.start()
|
| 278 |
interface.launch(show_error=True)
|
|
|
|
| 106 |
max_as_limit: int = 30 * 1024,
|
| 107 |
max_data_limit: int = 30 * 1024,
|
| 108 |
max_stack_limit: int = 10,
|
| 109 |
+
calibrated: bool = True,
|
| 110 |
check_gt_only: bool = False,
|
| 111 |
no_gt: bool = False,
|
| 112 |
+
selective_evaluate: str = "",
|
| 113 |
):
|
| 114 |
+
passk = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]
|
| 115 |
if parallel < 1:
|
| 116 |
n_workers = max(1, multiprocessing.cpu_count() // 2)
|
| 117 |
else:
|
|
|
|
| 123 |
extra = subset + "_" if subset != "full" else ""
|
| 124 |
|
| 125 |
problems = get_bigcodebench(subset=subset)
|
| 126 |
+
|
| 127 |
+
# Add selective evaluation logic
|
| 128 |
+
if selective_evaluate:
|
| 129 |
+
selected_ids = ["BigCodeBench/" + id for id in sorted(set(selective_evaluate.split(",")))]
|
| 130 |
+
problems = {k: v for k, v in problems.items() if k in selected_ids}
|
| 131 |
+
if not problems:
|
| 132 |
+
raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset")
|
| 133 |
+
|
| 134 |
dataset_hash = get_bigcodebench_hash(subset=subset)
|
| 135 |
|
| 136 |
if not no_gt:
|
|
|
|
| 166 |
if "solution" in sample
|
| 167 |
else problems[task_id]["complete_prompt"] + sample["completion"]
|
| 168 |
)
|
| 169 |
+
if calibrated:
|
| 170 |
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
|
| 171 |
remainings.add(sample["_identifier"])
|
| 172 |
args = (
|
|
|
|
| 223 |
|
| 224 |
pass_at_k.update({
|
| 225 |
f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
|
| 226 |
+
for k in passk
|
| 227 |
if total.min() >= k
|
| 228 |
})
|
| 229 |
|
|
|
|
| 233 |
pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
|
| 234 |
pass_at_k["split"] = split
|
| 235 |
pass_at_k["subset"] = subset
|
| 236 |
+
pass_at_k["calibrated"] = calibrated
|
| 237 |
pass_at_k["gt_pass_rate"] = gt_pass_rate
|
| 238 |
pass_at_k["failed_tasks"] = failed_tasks
|
| 239 |
|
|
|
|
| 253 |
gr.Slider(1, 100 * 1024, step=1024, label="Max AS Limit", value=30 * 1024),
|
| 254 |
gr.Slider(1, 100 * 1024, step=1024, label="Max Data Limit", value=30 * 1024),
|
| 255 |
gr.Slider(1, 100, step=1, label="Max Stack Limit", value=10),
|
| 256 |
+
gr.Checkbox(label="Calibrated", value=True),
|
| 257 |
gr.Checkbox(label="Check GT Only"),
|
| 258 |
gr.Checkbox(label="No GT"),
|
| 259 |
+
gr.Textbox(label="Selective Evaluated Task IDs (comma-separated, e.g. '0,1,2')", value=""),
|
| 260 |
],
|
| 261 |
outputs=[
|
| 262 |
gr.JSON(label="Results"),
|
|
|
|
| 283 |
|
| 284 |
|
| 285 |
# if __name__ == "__main__":
|
| 286 |
+
while True:
|
| 287 |
+
try:
|
| 288 |
+
preload_gt()
|
| 289 |
+
break
|
| 290 |
+
except:
|
| 291 |
+
continue
|
| 292 |
+
|
| 293 |
scheduler = BackgroundScheduler()
|
| 294 |
+
scheduler.add_job(restart_space, "interval", hours=1) # Restart every hour
|
| 295 |
scheduler.start()
|
| 296 |
interface.launch(show_error=True)
|