winnieyangwannan committed on
Commit
ab48f50
·
verified ·
1 Parent(s): 331c3fa

Training in progress, step 300, checkpoint

Browse files
checkpoint-300/adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "revision": null,
25
  "target_modules": [
26
  "gate_proj",
27
- "q_proj",
28
  "k_proj",
29
- "up_proj",
30
  "o_proj",
31
- "down_proj",
32
- "v_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
24
  "revision": null,
25
  "target_modules": [
26
  "gate_proj",
27
+ "v_proj",
28
  "k_proj",
29
+ "q_proj",
30
  "o_proj",
31
+ "up_proj",
32
+ "down_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-300/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e620d5341c5f33452674a2cf5db54e7621d0d9ac6df59d54fb764032cc74308
3
  size 216151256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:181e4c0b26d165813ae306c3fc797adfd7f010b04e42f39f677759505aa04a82
3
  size 216151256
checkpoint-300/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c66713f966db3ce2979b04d4a23e75efe4df94fee22faf6069dec595e9d5716
3
  size 432640054
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa4039faaa76388412a2ebaf73d7acf3a15d82fd851184f4bf6aedfd251b0c75
3
  size 432640054
checkpoint-300/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbda1e5a654d2530c53207896f8d4286870323a78a5385a32747e04d91df3aa2
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca45f95bcdfa3dc52d93139cc196214d00e007284ef333414453bd2343c3d8ea
3
  size 14244
checkpoint-300/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.379746835443038,
5
- "eval_steps": 50,
6
  "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
@@ -10,260 +10,452 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.012658227848101266,
13
- "grad_norm": 16.027029037475586,
14
  "learning_rate": 4.9789029535864986e-05,
15
- "loss": 2.6925,
 
 
 
 
 
 
 
 
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.02531645569620253,
20
- "grad_norm": 1.0948777198791504,
21
  "learning_rate": 4.957805907172996e-05,
22
- "loss": 1.386,
 
 
 
 
 
 
 
 
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0379746835443038,
27
- "grad_norm": 1.1341983079910278,
28
  "learning_rate": 4.936708860759494e-05,
29
- "loss": 1.108,
 
 
 
 
 
 
 
 
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.05063291139240506,
34
- "grad_norm": 1.152009129524231,
35
  "learning_rate": 4.9156118143459915e-05,
36
- "loss": 0.9436,
 
 
 
 
 
 
 
 
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.06329113924050633,
41
- "grad_norm": 1.0990614891052246,
42
  "learning_rate": 4.89451476793249e-05,
43
- "loss": 0.7499,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.06329113924050633,
48
- "eval_loss": 0.7157873511314392,
49
- "eval_runtime": 12.1594,
50
- "eval_samples_per_second": 39.476,
51
- "eval_steps_per_second": 2.467,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 0.0759493670886076,
56
- "grad_norm": 1.0284477472305298,
57
  "learning_rate": 4.8734177215189874e-05,
58
- "loss": 0.6095,
 
 
 
 
 
 
 
 
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.08860759493670886,
63
- "grad_norm": 0.8609589338302612,
64
  "learning_rate": 4.852320675105486e-05,
65
- "loss": 0.5355,
 
 
 
 
 
 
 
 
66
  "step": 70
67
  },
68
  {
69
  "epoch": 0.10126582278481013,
70
- "grad_norm": 0.9162376523017883,
71
  "learning_rate": 4.8312236286919834e-05,
72
- "loss": 0.5705,
 
 
 
 
 
 
 
 
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.11392405063291139,
77
- "grad_norm": 0.9415847659111023,
78
  "learning_rate": 4.810126582278481e-05,
79
- "loss": 0.5449,
 
 
 
 
 
 
 
 
80
  "step": 90
81
  },
82
  {
83
  "epoch": 0.12658227848101267,
84
- "grad_norm": 0.8756884336471558,
85
  "learning_rate": 4.789029535864979e-05,
86
- "loss": 0.5157,
87
  "step": 100
88
  },
89
  {
90
  "epoch": 0.12658227848101267,
91
- "eval_loss": 0.563517153263092,
92
- "eval_runtime": 12.1236,
93
- "eval_samples_per_second": 39.592,
94
- "eval_steps_per_second": 2.475,
95
  "step": 100
96
  },
97
  {
98
  "epoch": 0.13924050632911392,
99
- "grad_norm": 0.8702118396759033,
100
  "learning_rate": 4.767932489451477e-05,
101
- "loss": 0.53,
 
 
 
 
 
 
 
 
102
  "step": 110
103
  },
104
  {
105
  "epoch": 0.1518987341772152,
106
- "grad_norm": 0.8843992352485657,
107
  "learning_rate": 4.7468354430379746e-05,
108
- "loss": 0.4923,
 
 
 
 
 
 
 
 
109
  "step": 120
110
  },
111
  {
112
  "epoch": 0.16455696202531644,
113
- "grad_norm": 0.8294386863708496,
114
  "learning_rate": 4.725738396624473e-05,
115
- "loss": 0.4936,
 
 
 
 
 
 
 
 
116
  "step": 130
117
  },
118
  {
119
  "epoch": 0.17721518987341772,
120
- "grad_norm": 0.8978216648101807,
121
  "learning_rate": 4.704641350210971e-05,
122
- "loss": 0.4581,
 
 
 
 
 
 
 
 
123
  "step": 140
124
  },
125
  {
126
  "epoch": 0.189873417721519,
127
- "grad_norm": 0.8757727742195129,
128
  "learning_rate": 4.683544303797468e-05,
129
- "loss": 0.4571,
130
  "step": 150
131
  },
132
  {
133
  "epoch": 0.189873417721519,
134
- "eval_loss": 0.5096740126609802,
135
- "eval_runtime": 12.0892,
136
- "eval_samples_per_second": 39.705,
137
- "eval_steps_per_second": 2.482,
138
  "step": 150
139
  },
140
  {
141
  "epoch": 0.20253164556962025,
142
- "grad_norm": 0.8959233164787292,
143
  "learning_rate": 4.6624472573839666e-05,
144
- "loss": 0.4429,
 
 
 
 
 
 
 
 
145
  "step": 160
146
  },
147
  {
148
  "epoch": 0.21518987341772153,
149
- "grad_norm": 0.9160757660865784,
150
  "learning_rate": 4.641350210970464e-05,
151
- "loss": 0.4167,
 
 
 
 
 
 
 
 
152
  "step": 170
153
  },
154
  {
155
  "epoch": 0.22784810126582278,
156
- "grad_norm": 0.8140855431556702,
157
  "learning_rate": 4.6202531645569625e-05,
158
- "loss": 0.4249,
 
 
 
 
 
 
 
 
159
  "step": 180
160
  },
161
  {
162
  "epoch": 0.24050632911392406,
163
- "grad_norm": 0.8790073990821838,
164
  "learning_rate": 4.59915611814346e-05,
165
- "loss": 0.4198,
 
 
 
 
 
 
 
 
166
  "step": 190
167
  },
168
  {
169
  "epoch": 0.25316455696202533,
170
- "grad_norm": 0.8366842269897461,
171
  "learning_rate": 4.5780590717299585e-05,
172
- "loss": 0.4148,
173
  "step": 200
174
  },
175
  {
176
  "epoch": 0.25316455696202533,
177
- "eval_loss": 0.47525277733802795,
178
- "eval_runtime": 12.0592,
179
- "eval_samples_per_second": 39.803,
180
- "eval_steps_per_second": 2.488,
181
  "step": 200
182
  },
183
  {
184
  "epoch": 0.26582278481012656,
185
- "grad_norm": 0.8733569383621216,
186
  "learning_rate": 4.556962025316456e-05,
187
- "loss": 0.3865,
 
 
 
 
 
 
 
 
188
  "step": 210
189
  },
190
  {
191
  "epoch": 0.27848101265822783,
192
- "grad_norm": 0.8695210218429565,
193
  "learning_rate": 4.535864978902954e-05,
194
- "loss": 0.3681,
 
 
 
 
 
 
 
 
195
  "step": 220
196
  },
197
  {
198
  "epoch": 0.2911392405063291,
199
- "grad_norm": 0.8353093862533569,
200
  "learning_rate": 4.5147679324894514e-05,
201
- "loss": 0.4121,
 
 
 
 
 
 
 
 
202
  "step": 230
203
  },
204
  {
205
  "epoch": 0.3037974683544304,
206
- "grad_norm": 0.9650343656539917,
207
  "learning_rate": 4.49367088607595e-05,
208
- "loss": 0.3987,
 
 
 
 
 
 
 
 
209
  "step": 240
210
  },
211
  {
212
  "epoch": 0.31645569620253167,
213
- "grad_norm": 0.9851623773574829,
214
  "learning_rate": 4.4725738396624474e-05,
215
- "loss": 0.4028,
216
  "step": 250
217
  },
218
  {
219
  "epoch": 0.31645569620253167,
220
- "eval_loss": 0.4751642048358917,
221
- "eval_runtime": 12.0237,
222
- "eval_samples_per_second": 39.921,
223
- "eval_steps_per_second": 2.495,
224
  "step": 250
225
  },
226
  {
227
  "epoch": 0.3291139240506329,
228
- "grad_norm": 0.8186360597610474,
229
  "learning_rate": 4.451476793248946e-05,
230
- "loss": 0.405,
 
 
 
 
 
 
 
 
231
  "step": 260
232
  },
233
  {
234
  "epoch": 0.34177215189873417,
235
- "grad_norm": 0.76863032579422,
236
  "learning_rate": 4.430379746835443e-05,
237
- "loss": 0.4134,
 
 
 
 
 
 
 
 
238
  "step": 270
239
  },
240
  {
241
  "epoch": 0.35443037974683544,
242
- "grad_norm": 0.785301685333252,
243
  "learning_rate": 4.409282700421941e-05,
244
- "loss": 0.3745,
 
 
 
 
 
 
 
 
245
  "step": 280
246
  },
247
  {
248
  "epoch": 0.3670886075949367,
249
- "grad_norm": 0.8613535165786743,
250
  "learning_rate": 4.388185654008439e-05,
251
- "loss": 0.3804,
 
 
 
 
 
 
 
 
252
  "step": 290
253
  },
254
  {
255
  "epoch": 0.379746835443038,
256
- "grad_norm": 0.8152708411216736,
257
  "learning_rate": 4.367088607594937e-05,
258
- "loss": 0.3757,
259
  "step": 300
260
  },
261
  {
262
  "epoch": 0.379746835443038,
263
- "eval_loss": 0.4623233675956726,
264
- "eval_runtime": 12.0099,
265
- "eval_samples_per_second": 39.967,
266
- "eval_steps_per_second": 2.498,
267
  "step": 300
268
  }
269
  ],
@@ -271,7 +463,7 @@
271
  "max_steps": 2370,
272
  "num_input_tokens_seen": 0,
273
  "num_train_epochs": 3,
274
- "save_steps": 100,
275
  "stateful_callbacks": {
276
  "TrainerControl": {
277
  "args": {
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.379746835443038,
5
+ "eval_steps": 10,
6
  "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.012658227848101266,
13
+ "grad_norm": 12.934767723083496,
14
  "learning_rate": 4.9789029535864986e-05,
15
+ "loss": 2.6869,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.012658227848101266,
20
+ "eval_loss": 1.651185393333435,
21
+ "eval_runtime": 11.831,
22
+ "eval_samples_per_second": 40.571,
23
+ "eval_steps_per_second": 2.536,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.02531645569620253,
28
+ "grad_norm": 1.104798674583435,
29
  "learning_rate": 4.957805907172996e-05,
30
+ "loss": 1.3694,
31
+ "step": 20
32
+ },
33
+ {
34
+ "epoch": 0.02531645569620253,
35
+ "eval_loss": 1.2200205326080322,
36
+ "eval_runtime": 11.8928,
37
+ "eval_samples_per_second": 40.361,
38
+ "eval_steps_per_second": 2.523,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.0379746835443038,
43
+ "grad_norm": 1.1069142818450928,
44
  "learning_rate": 4.936708860759494e-05,
45
+ "loss": 1.1029,
46
+ "step": 30
47
+ },
48
+ {
49
+ "epoch": 0.0379746835443038,
50
+ "eval_loss": 1.0691736936569214,
51
+ "eval_runtime": 11.9127,
52
+ "eval_samples_per_second": 40.293,
53
+ "eval_steps_per_second": 2.518,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 0.05063291139240506,
58
+ "grad_norm": 1.1594161987304688,
59
  "learning_rate": 4.9156118143459915e-05,
60
+ "loss": 0.9395,
61
+ "step": 40
62
+ },
63
+ {
64
+ "epoch": 0.05063291139240506,
65
+ "eval_loss": 0.9162012934684753,
66
+ "eval_runtime": 11.9373,
67
+ "eval_samples_per_second": 40.21,
68
+ "eval_steps_per_second": 2.513,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 0.06329113924050633,
73
+ "grad_norm": 1.1133538484573364,
74
  "learning_rate": 4.89451476793249e-05,
75
+ "loss": 0.7489,
76
  "step": 50
77
  },
78
  {
79
  "epoch": 0.06329113924050633,
80
+ "eval_loss": 0.713701605796814,
81
+ "eval_runtime": 11.9661,
82
+ "eval_samples_per_second": 40.113,
83
+ "eval_steps_per_second": 2.507,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 0.0759493670886076,
88
+ "grad_norm": 1.0406183004379272,
89
  "learning_rate": 4.8734177215189874e-05,
90
+ "loss": 0.6096,
91
+ "step": 60
92
+ },
93
+ {
94
+ "epoch": 0.0759493670886076,
95
+ "eval_loss": 0.6309535503387451,
96
+ "eval_runtime": 11.9895,
97
+ "eval_samples_per_second": 40.035,
98
+ "eval_steps_per_second": 2.502,
99
  "step": 60
100
  },
101
  {
102
  "epoch": 0.08860759493670886,
103
+ "grad_norm": 0.8599340915679932,
104
  "learning_rate": 4.852320675105486e-05,
105
+ "loss": 0.5357,
106
+ "step": 70
107
+ },
108
+ {
109
+ "epoch": 0.08860759493670886,
110
+ "eval_loss": 0.6159886717796326,
111
+ "eval_runtime": 12.0107,
112
+ "eval_samples_per_second": 39.965,
113
+ "eval_steps_per_second": 2.498,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.10126582278481013,
118
+ "grad_norm": 0.9128267168998718,
119
  "learning_rate": 4.8312236286919834e-05,
120
+ "loss": 0.5703,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.10126582278481013,
125
+ "eval_loss": 0.5933937430381775,
126
+ "eval_runtime": 11.9716,
127
+ "eval_samples_per_second": 40.095,
128
+ "eval_steps_per_second": 2.506,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 0.11392405063291139,
133
+ "grad_norm": 0.9396541118621826,
134
  "learning_rate": 4.810126582278481e-05,
135
+ "loss": 0.5445,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.11392405063291139,
140
+ "eval_loss": 0.5727818608283997,
141
+ "eval_runtime": 11.9685,
142
+ "eval_samples_per_second": 40.105,
143
+ "eval_steps_per_second": 2.507,
144
  "step": 90
145
  },
146
  {
147
  "epoch": 0.12658227848101267,
148
+ "grad_norm": 0.8805290460586548,
149
  "learning_rate": 4.789029535864979e-05,
150
+ "loss": 0.5151,
151
  "step": 100
152
  },
153
  {
154
  "epoch": 0.12658227848101267,
155
+ "eval_loss": 0.5640087127685547,
156
+ "eval_runtime": 11.9824,
157
+ "eval_samples_per_second": 40.059,
158
+ "eval_steps_per_second": 2.504,
159
  "step": 100
160
  },
161
  {
162
  "epoch": 0.13924050632911392,
163
+ "grad_norm": 0.8829126954078674,
164
  "learning_rate": 4.767932489451477e-05,
165
+ "loss": 0.5301,
166
+ "step": 110
167
+ },
168
+ {
169
+ "epoch": 0.13924050632911392,
170
+ "eval_loss": 0.5558986067771912,
171
+ "eval_runtime": 11.9942,
172
+ "eval_samples_per_second": 40.019,
173
+ "eval_steps_per_second": 2.501,
174
  "step": 110
175
  },
176
  {
177
  "epoch": 0.1518987341772152,
178
+ "grad_norm": 0.8889341950416565,
179
  "learning_rate": 4.7468354430379746e-05,
180
+ "loss": 0.4929,
181
+ "step": 120
182
+ },
183
+ {
184
+ "epoch": 0.1518987341772152,
185
+ "eval_loss": 0.5422877073287964,
186
+ "eval_runtime": 11.9624,
187
+ "eval_samples_per_second": 40.126,
188
+ "eval_steps_per_second": 2.508,
189
  "step": 120
190
  },
191
  {
192
  "epoch": 0.16455696202531644,
193
+ "grad_norm": 0.8428446054458618,
194
  "learning_rate": 4.725738396624473e-05,
195
+ "loss": 0.4932,
196
+ "step": 130
197
+ },
198
+ {
199
+ "epoch": 0.16455696202531644,
200
+ "eval_loss": 0.5370256900787354,
201
+ "eval_runtime": 11.9786,
202
+ "eval_samples_per_second": 40.071,
203
+ "eval_steps_per_second": 2.504,
204
  "step": 130
205
  },
206
  {
207
  "epoch": 0.17721518987341772,
208
+ "grad_norm": 0.8985374569892883,
209
  "learning_rate": 4.704641350210971e-05,
210
+ "loss": 0.4589,
211
+ "step": 140
212
+ },
213
+ {
214
+ "epoch": 0.17721518987341772,
215
+ "eval_loss": 0.5208094716072083,
216
+ "eval_runtime": 11.9711,
217
+ "eval_samples_per_second": 40.097,
218
+ "eval_steps_per_second": 2.506,
219
  "step": 140
220
  },
221
  {
222
  "epoch": 0.189873417721519,
223
+ "grad_norm": 0.8704663515090942,
224
  "learning_rate": 4.683544303797468e-05,
225
+ "loss": 0.4585,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.189873417721519,
230
+ "eval_loss": 0.5105039477348328,
231
+ "eval_runtime": 11.9751,
232
+ "eval_samples_per_second": 40.083,
233
+ "eval_steps_per_second": 2.505,
234
  "step": 150
235
  },
236
  {
237
  "epoch": 0.20253164556962025,
238
+ "grad_norm": 0.8930565714836121,
239
  "learning_rate": 4.6624472573839666e-05,
240
+ "loss": 0.4438,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 0.20253164556962025,
245
+ "eval_loss": 0.4988265931606293,
246
+ "eval_runtime": 11.9819,
247
+ "eval_samples_per_second": 40.06,
248
+ "eval_steps_per_second": 2.504,
249
  "step": 160
250
  },
251
  {
252
  "epoch": 0.21518987341772153,
253
+ "grad_norm": 0.9236075282096863,
254
  "learning_rate": 4.641350210970464e-05,
255
+ "loss": 0.4171,
256
+ "step": 170
257
+ },
258
+ {
259
+ "epoch": 0.21518987341772153,
260
+ "eval_loss": 0.4941176176071167,
261
+ "eval_runtime": 11.9797,
262
+ "eval_samples_per_second": 40.068,
263
+ "eval_steps_per_second": 2.504,
264
  "step": 170
265
  },
266
  {
267
  "epoch": 0.22784810126582278,
268
+ "grad_norm": 0.8057528138160706,
269
  "learning_rate": 4.6202531645569625e-05,
270
+ "loss": 0.4244,
271
+ "step": 180
272
+ },
273
+ {
274
+ "epoch": 0.22784810126582278,
275
+ "eval_loss": 0.489634245634079,
276
+ "eval_runtime": 11.9664,
277
+ "eval_samples_per_second": 40.112,
278
+ "eval_steps_per_second": 2.507,
279
  "step": 180
280
  },
281
  {
282
  "epoch": 0.24050632911392406,
283
+ "grad_norm": 0.8802728056907654,
284
  "learning_rate": 4.59915611814346e-05,
285
+ "loss": 0.4196,
286
+ "step": 190
287
+ },
288
+ {
289
+ "epoch": 0.24050632911392406,
290
+ "eval_loss": 0.48033541440963745,
291
+ "eval_runtime": 11.9754,
292
+ "eval_samples_per_second": 40.082,
293
+ "eval_steps_per_second": 2.505,
294
  "step": 190
295
  },
296
  {
297
  "epoch": 0.25316455696202533,
298
+ "grad_norm": 0.8316253423690796,
299
  "learning_rate": 4.5780590717299585e-05,
300
+ "loss": 0.4144,
301
  "step": 200
302
  },
303
  {
304
  "epoch": 0.25316455696202533,
305
+ "eval_loss": 0.4757111966609955,
306
+ "eval_runtime": 11.9744,
307
+ "eval_samples_per_second": 40.086,
308
+ "eval_steps_per_second": 2.505,
309
  "step": 200
310
  },
311
  {
312
  "epoch": 0.26582278481012656,
313
+ "grad_norm": 0.8675290942192078,
314
  "learning_rate": 4.556962025316456e-05,
315
+ "loss": 0.3866,
316
+ "step": 210
317
+ },
318
+ {
319
+ "epoch": 0.26582278481012656,
320
+ "eval_loss": 0.4751383662223816,
321
+ "eval_runtime": 11.9976,
322
+ "eval_samples_per_second": 40.008,
323
+ "eval_steps_per_second": 2.501,
324
  "step": 210
325
  },
326
  {
327
  "epoch": 0.27848101265822783,
328
+ "grad_norm": 0.8654202818870544,
329
  "learning_rate": 4.535864978902954e-05,
330
+ "loss": 0.3685,
331
+ "step": 220
332
+ },
333
+ {
334
+ "epoch": 0.27848101265822783,
335
+ "eval_loss": 0.47636380791664124,
336
+ "eval_runtime": 11.988,
337
+ "eval_samples_per_second": 40.04,
338
+ "eval_steps_per_second": 2.502,
339
  "step": 220
340
  },
341
  {
342
  "epoch": 0.2911392405063291,
343
+ "grad_norm": 0.835969090461731,
344
  "learning_rate": 4.5147679324894514e-05,
345
+ "loss": 0.4117,
346
+ "step": 230
347
+ },
348
+ {
349
+ "epoch": 0.2911392405063291,
350
+ "eval_loss": 0.4723130762577057,
351
+ "eval_runtime": 11.9995,
352
+ "eval_samples_per_second": 40.002,
353
+ "eval_steps_per_second": 2.5,
354
  "step": 230
355
  },
356
  {
357
  "epoch": 0.3037974683544304,
358
+ "grad_norm": 0.9585816860198975,
359
  "learning_rate": 4.49367088607595e-05,
360
+ "loss": 0.398,
361
+ "step": 240
362
+ },
363
+ {
364
+ "epoch": 0.3037974683544304,
365
+ "eval_loss": 0.472170889377594,
366
+ "eval_runtime": 12.0019,
367
+ "eval_samples_per_second": 39.994,
368
+ "eval_steps_per_second": 2.5,
369
  "step": 240
370
  },
371
  {
372
  "epoch": 0.31645569620253167,
373
+ "grad_norm": 1.0022097826004028,
374
  "learning_rate": 4.4725738396624474e-05,
375
+ "loss": 0.403,
376
  "step": 250
377
  },
378
  {
379
  "epoch": 0.31645569620253167,
380
+ "eval_loss": 0.47611597180366516,
381
+ "eval_runtime": 12.0087,
382
+ "eval_samples_per_second": 39.971,
383
+ "eval_steps_per_second": 2.498,
384
  "step": 250
385
  },
386
  {
387
  "epoch": 0.3291139240506329,
388
+ "grad_norm": 0.8225414156913757,
389
  "learning_rate": 4.451476793248946e-05,
390
+ "loss": 0.4045,
391
+ "step": 260
392
+ },
393
+ {
394
+ "epoch": 0.3291139240506329,
395
+ "eval_loss": 0.47163695096969604,
396
+ "eval_runtime": 11.9976,
397
+ "eval_samples_per_second": 40.008,
398
+ "eval_steps_per_second": 2.501,
399
  "step": 260
400
  },
401
  {
402
  "epoch": 0.34177215189873417,
403
+ "grad_norm": 0.765426754951477,
404
  "learning_rate": 4.430379746835443e-05,
405
+ "loss": 0.4147,
406
+ "step": 270
407
+ },
408
+ {
409
+ "epoch": 0.34177215189873417,
410
+ "eval_loss": 0.4663979113101959,
411
+ "eval_runtime": 12.0321,
412
+ "eval_samples_per_second": 39.893,
413
+ "eval_steps_per_second": 2.493,
414
  "step": 270
415
  },
416
  {
417
  "epoch": 0.35443037974683544,
418
+ "grad_norm": 0.783598780632019,
419
  "learning_rate": 4.409282700421941e-05,
420
+ "loss": 0.3747,
421
+ "step": 280
422
+ },
423
+ {
424
+ "epoch": 0.35443037974683544,
425
+ "eval_loss": 0.4673294723033905,
426
+ "eval_runtime": 12.0086,
427
+ "eval_samples_per_second": 39.971,
428
+ "eval_steps_per_second": 2.498,
429
  "step": 280
430
  },
431
  {
432
  "epoch": 0.3670886075949367,
433
+ "grad_norm": 0.830932080745697,
434
  "learning_rate": 4.388185654008439e-05,
435
+ "loss": 0.3803,
436
+ "step": 290
437
+ },
438
+ {
439
+ "epoch": 0.3670886075949367,
440
+ "eval_loss": 0.4643842875957489,
441
+ "eval_runtime": 12.0174,
442
+ "eval_samples_per_second": 39.942,
443
+ "eval_steps_per_second": 2.496,
444
  "step": 290
445
  },
446
  {
447
  "epoch": 0.379746835443038,
448
+ "grad_norm": 0.8144668936729431,
449
  "learning_rate": 4.367088607594937e-05,
450
+ "loss": 0.3756,
451
  "step": 300
452
  },
453
  {
454
  "epoch": 0.379746835443038,
455
+ "eval_loss": 0.462455689907074,
456
+ "eval_runtime": 12.0337,
457
+ "eval_samples_per_second": 39.888,
458
+ "eval_steps_per_second": 2.493,
459
  "step": 300
460
  }
461
  ],
 
463
  "max_steps": 2370,
464
  "num_input_tokens_seen": 0,
465
  "num_train_epochs": 3,
466
+ "save_steps": 10,
467
  "stateful_callbacks": {
468
  "TrainerControl": {
469
  "args": {
checkpoint-300/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8ef5fa4aad3a350c14df025074931ad8a003d4b851f4886f3b2f66ae6653e4b
3
  size 5880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1e7aae8b855413d55586dd498c7d7d805796f0c02067ce9d8ccb1ef37f72d29
3
  size 5880