md896 committed on
Commit
d2042c8
·
verified ·
1 Parent(s): 05bd818

Upload folder using huggingface_hub

Browse files
artifacts/runs/20260426-060502-final-pass-32eval/api_errors.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "base": [],
3
+ "candidate_1": [],
4
+ "candidate_2": []
5
+ }
artifacts/runs/20260426-060502-final-pass-32eval/benchmark_style_final.png ADDED
artifacts/runs/20260426-060502-final-pass-32eval/checkpoint_leaderboard_final.png ADDED
artifacts/runs/20260426-060502-final-pass-32eval/comparison_table.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ task,baseline_reward,post_reward,delta,relative_delta_percent
2
+ easy_syntax_fix,0.120287,0.129662,0.009375,7.79
3
+ medium_logic_fix,0.117475,0.118725,0.001250,1.06
4
+ hard_multi_bug,0.100600,0.100600,0.000000,0.00
5
+ hard_finance_explosion,0.104584,0.108438,0.003853,3.68
6
+ overall,0.110737,0.114356,0.003620,3.27
artifacts/runs/20260426-060502-final-pass-32eval/comparison_table.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Final Corrected Comparison (Base vs Winner)
2
+
3
+ - winner_model: `md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2`
4
+ - samples_per_task: 32
5
+ - eval_best_of_n: 2
6
+
7
+ | task | baseline | winner | delta | relative delta % |
8
+ |---|---:|---:|---:|---:|
9
+ | easy_syntax_fix | 0.120287 | 0.129662 | 0.009375 | 7.79% |
10
+ | medium_logic_fix | 0.117475 | 0.118725 | 0.001250 | 1.06% |
11
+ | hard_multi_bug | 0.100600 | 0.100600 | 0.000000 | 0.00% |
12
+ | hard_finance_explosion | 0.104584 | 0.108438 | 0.003853 | 3.68% |
13
+ | overall | 0.110737 | 0.114356 | 0.003620 | 3.27% |
artifacts/runs/20260426-060502-final-pass-32eval/final_metrics.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_ids": [
3
+ "easy_syntax_fix",
4
+ "medium_logic_fix",
5
+ "hard_multi_bug",
6
+ "hard_finance_explosion"
7
+ ],
8
+ "samples_per_task": 32,
9
+ "eval_best_of_n": 2,
10
+ "models": {
11
+ "base": {
12
+ "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
13
+ "overall_reward": 0.11073671874999999,
14
+ "error_count": 0,
15
+ "per_task_reward": {
16
+ "easy_syntax_fix": 0.12028749999999996,
17
+ "medium_logic_fix": 0.11747499999999998,
18
+ "hard_multi_bug": 0.10060000000000001,
19
+ "hard_finance_explosion": 0.10458437500000003
20
+ }
21
+ },
22
+ "candidate_1": {
23
+ "model_id": "md896/sql-debug-agent-qwen25-05b-grpo-wandb-best",
24
+ "overall_reward": 0.11326249999999999,
25
+ "error_count": 0,
26
+ "per_task_reward": {
27
+ "easy_syntax_fix": 0.12591249999999996,
28
+ "medium_logic_fix": 0.1177875,
29
+ "hard_multi_bug": 0.10060000000000001,
30
+ "hard_finance_explosion": 0.10875000000000001
31
+ }
32
+ },
33
+ "candidate_2": {
34
+ "model_id": "md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2",
35
+ "overall_reward": 0.11435624999999999,
36
+ "error_count": 0,
37
+ "per_task_reward": {
38
+ "easy_syntax_fix": 0.12966249999999996,
39
+ "medium_logic_fix": 0.118725,
40
+ "hard_multi_bug": 0.10060000000000001,
41
+ "hard_finance_explosion": 0.10843750000000002
42
+ }
43
+ }
44
+ },
45
+ "winner_label": "candidate_2",
46
+ "winner_model_id": "md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2",
47
+ "winner_delta_vs_base": 0.003619531250000002
48
+ }
artifacts/runs/20260426-060502-final-pass-32eval/leaderboard.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ rank,label,model_id,overall_reward
2
+ 1,candidate_2,md896/sql-debug-agent-qwen25-05b-grpo-wandb-continue-v2,0.114356
3
+ 2,candidate_1,md896/sql-debug-agent-qwen25-05b-grpo-wandb-best,0.113262
4
+ 3,base,Qwen/Qwen2.5-0.5B-Instruct,0.110737
artifacts/runs/20260426-060502-final-pass-32eval/performance_comparison_final.png ADDED
artifacts/runs/20260426-060502-final-pass-32eval/reward_distribution_shift_final.png ADDED
artifacts/runs/20260426-060502-final-pass-32eval/task_delta_final.png ADDED