ycwhencpp committed on
Commit
17149c8
·
verified ·
1 Parent(s): 0c87e02

HF Job: train_grpo run output

Browse files
run-output/plots/io_log.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
run-output/plots/training_log.csv CHANGED
@@ -1,7 +1 @@
1
  phase,round,global_step,use_hint,avg_episode_reward,max_episode_reward,min_episode_reward,avg_grader,max_grader,n_training_samples,train_loss
2
- phase1_timing,1,1,True,5.127,5.315,4.96,0.9498,1.0,81,2.833
3
- phase1_timing,2,2,False,3.04,3.303,2.6,0.259,0.3614,96,3.1413
4
- phase1_timing,3,3,False,2.867,3.016,2.555,0.2083,0.3042,102,3.1255
5
- phase2_content,1,4,True,3.538,3.837,3.338,0.8697,1.0,77,2.8381
6
- phase2_content,2,5,False,2.15,2.807,1.587,0.3763,0.5979,90,2.9281
7
- phase2_content,3,6,False,1.924,2.609,1.375,0.2855,0.5027,76,2.9184
 
1
  phase,round,global_step,use_hint,avg_episode_reward,max_episode_reward,min_episode_reward,avg_grader,max_grader,n_training_samples,train_loss
 
 
 
 
 
 
run-output/plots/training_summary.json CHANGED
@@ -1,21 +1,18 @@
1
  {
2
  "model": "Qwen/Qwen2.5-3B-Instruct",
3
  "training": "Two-phase LoRA SFT (timing -> content) with hardcoded peak-hours hint on round 1 of each phase",
4
- "phases": [
5
- "phase1_timing",
6
- "phase2_content"
7
- ],
8
  "rounds_per_phase": 3,
9
  "episodes_per_round": 6,
10
  "before": {
11
- "monthly_engage": 0.0,
12
- "monthly_strategic": 0.175,
13
- "monthly_competitive": 0.035
14
  },
15
  "after": {
16
- "monthly_engage": 0.0,
17
- "monthly_strategic": 0.175,
18
- "monthly_competitive": 0.035
19
  },
20
  "smart_heuristic": {
21
  "monthly_engage": 0.7519,
@@ -23,98 +20,21 @@
23
  "monthly_competitive": 0.9141
24
  },
25
  "improvement": {
26
- "monthly_engage": 0.0,
27
- "monthly_strategic": 0.0,
28
- "monthly_competitive": 0.0
29
  },
30
  "training_log": {
31
- "phase": [
32
- "phase1_timing",
33
- "phase1_timing",
34
- "phase1_timing",
35
- "phase2_content",
36
- "phase2_content",
37
- "phase2_content"
38
- ],
39
- "round": [
40
- 1,
41
- 2,
42
- 3,
43
- 1,
44
- 2,
45
- 3
46
- ],
47
- "global_step": [
48
- 1,
49
- 2,
50
- 3,
51
- 4,
52
- 5,
53
- 6
54
- ],
55
- "use_hint": [
56
- true,
57
- false,
58
- false,
59
- true,
60
- false,
61
- false
62
- ],
63
- "avg_episode_reward": [
64
- 5.127,
65
- 3.04,
66
- 2.867,
67
- 3.538,
68
- 2.15,
69
- 1.924
70
- ],
71
- "max_episode_reward": [
72
- 5.315,
73
- 3.303,
74
- 3.016,
75
- 3.837,
76
- 2.807,
77
- 2.609
78
- ],
79
- "min_episode_reward": [
80
- 4.96,
81
- 2.6,
82
- 2.555,
83
- 3.338,
84
- 1.587,
85
- 1.375
86
- ],
87
- "avg_grader": [
88
- 0.9498,
89
- 0.259,
90
- 0.2083,
91
- 0.8697,
92
- 0.3763,
93
- 0.2855
94
- ],
95
- "max_grader": [
96
- 1.0,
97
- 0.3614,
98
- 0.3042,
99
- 1.0,
100
- 0.5979,
101
- 0.5027
102
- ],
103
- "n_training_samples": [
104
- 81,
105
- 96,
106
- 102,
107
- 77,
108
- 90,
109
- 76
110
- ],
111
- "train_loss": [
112
- 2.833,
113
- 3.1413,
114
- 3.1255,
115
- 2.8381,
116
- 2.9281,
117
- 2.9184
118
- ]
119
  }
120
  }
 
1
  {
2
  "model": "Qwen/Qwen2.5-3B-Instruct",
3
  "training": "Two-phase LoRA SFT (timing -> content) with hardcoded peak-hours hint on round 1 of each phase",
4
+ "phases": [],
 
 
 
5
  "rounds_per_phase": 3,
6
  "episodes_per_round": 6,
7
  "before": {
8
+ "monthly_engage": 1.0,
9
+ "monthly_strategic": 0.8357,
10
+ "monthly_competitive": 0.9414
11
  },
12
  "after": {
13
+ "monthly_engage": 0.999,
14
+ "monthly_strategic": 0.9321439559505211,
15
+ "monthly_competitive": 0.999
16
  },
17
  "smart_heuristic": {
18
  "monthly_engage": 0.7519,
 
20
  "monthly_competitive": 0.9141
21
  },
22
  "improvement": {
23
+ "monthly_engage": -0.0010000000000000009,
24
+ "monthly_strategic": 0.09644395595052113,
25
+ "monthly_competitive": 0.057599999999999985
26
  },
27
  "training_log": {
28
+ "phase": [],
29
+ "round": [],
30
+ "global_step": [],
31
+ "use_hint": [],
32
+ "avg_episode_reward": [],
33
+ "max_episode_reward": [],
34
+ "min_episode_reward": [],
35
+ "avg_grader": [],
36
+ "max_grader": [],
37
+ "n_training_samples": [],
38
+ "train_loss": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  }
40
  }
run-output/training/train_grpo.executed.ipynb CHANGED
The diff for this file is too large to render. See raw diff