tyzhu commited on
Commit
dad3bc1
1 Parent(s): 6b4dcbc

Training in progress, epoch 16, checkpoint

Browse files
checkpoint-17428/adapter_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "Qwen/Qwen1.5-4B",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
 
1
  {
2
  "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
checkpoint-17428/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbccc949c921719f56981a54488e4eebc74dbf25b75a1cf7a0faf3845166adc4
3
+ size 143269386
checkpoint-17428/added_tokens.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "<|endoftext|>": 151643,
3
- "<|im_end|>": 151645,
4
- "<|im_start|>": 151644
5
  }
 
1
  {
2
+ "</s>": 2,
3
+ "<s>": 1,
4
+ "<unk>": 0
5
  }
checkpoint-17428/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a17139e8ace0333b1ea5195b42adaf89b2ce13c3faec826fae5433c5f0d33630
3
- size 224537202
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1933848ff103d4f0ff5fbe7340c0bac8597f6c9c94a6ecc5100c90e55dad0108
3
+ size 286585234
checkpoint-17428/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96dca23719709783dbd6354c201b073dee6108546d2075aaa413dfa393d520bf
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3da29104047a0f8aea74256a89b1c58d7465755e27716aabfc15a2228e2e93e
3
  size 15024
checkpoint-17428/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bca8582bc96a9e72a1f076efb65e0982c408c99995cb3e30c9650588a0ccac4c
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc39083a5502d69b09c9d17fa44983d3506121ab6780dd18dc5b2396e451eeeb
3
  size 15024
checkpoint-17428/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0dfec9f1e2b96a458d40112c36a9b7735918fdb3783c82ae4b7a9bb6db887f6
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4f75a6374f1a529f1facf703cc8e1ae8d0fb2cc88c5faa7946531f6c36cc1a0
3
  size 15024
checkpoint-17428/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d27c78c25dd9677ff6fe82cd4159a639a40e5b048e3c2fa63bef96e0896fcd0e
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be4831a3280a035cf2e361459f24890e28935f2e89acef8a48833e7d51e0529b
3
  size 15024
checkpoint-17428/special_tokens_map.json CHANGED
@@ -1,14 +1,6 @@
1
  {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>"
5
- ],
6
- "eos_token": {
7
- "content": "<|endoftext|>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false
12
- },
13
- "pad_token": "<|endoftext|>"
14
  }
 
1
  {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "</s>",
5
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
6
  }
checkpoint-17428/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
checkpoint-17428/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-17428/tokenizer_config.json CHANGED
@@ -1,24 +1,23 @@
1
  {
2
- "add_prefix_space": false,
3
  "added_tokens_decoder": {
4
- "151643": {
5
- "content": "<|endoftext|>",
6
  "lstrip": false,
7
  "normalized": false,
8
  "rstrip": false,
9
  "single_word": false,
10
  "special": true
11
  },
12
- "151644": {
13
- "content": "<|im_start|>",
14
  "lstrip": false,
15
  "normalized": false,
16
  "rstrip": false,
17
  "single_word": false,
18
  "special": true
19
  },
20
- "151645": {
21
- "content": "<|im_end|>",
22
  "lstrip": false,
23
  "normalized": false,
24
  "rstrip": false,
@@ -26,18 +25,16 @@
26
  "special": true
27
  }
28
  },
29
- "additional_special_tokens": [
30
- "<|im_start|>",
31
- "<|im_end|>"
32
- ],
33
- "bos_token": null,
34
- "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
35
  "clean_up_tokenization_spaces": false,
36
- "eos_token": "<|endoftext|>",
37
- "errors": "replace",
38
- "model_max_length": 32768,
39
- "pad_token": "<|endoftext|>",
40
- "split_special_tokens": false,
41
- "tokenizer_class": "Qwen2Tokenizer",
42
- "unk_token": null
 
 
43
  }
 
1
  {
 
2
  "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
8
  "single_word": false,
9
  "special": true
10
  },
11
+ "1": {
12
+ "content": "<s>",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
16
  "single_word": false,
17
  "special": true
18
  },
19
+ "2": {
20
+ "content": "</s>",
21
  "lstrip": false,
22
  "normalized": false,
23
  "rstrip": false,
 
25
  "special": true
26
  }
27
  },
28
+ "additional_special_tokens": [],
29
+ "bos_token": "<s>",
 
 
 
 
30
  "clean_up_tokenization_spaces": false,
31
+ "eos_token": "</s>",
32
+ "legacy": false,
33
+ "model_max_length": 1000000000000000019884624838656,
34
+ "pad_token": "</s>",
35
+ "padding_side": "left",
36
+ "sp_model_kwargs": {},
37
+ "tokenizer_class": "LlamaTokenizer",
38
+ "unk_token": "<unk>",
39
+ "use_default_system_prompt": true
40
  }
checkpoint-17428/trainer_state.json CHANGED
@@ -9,1483 +9,1295 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.09180628873077806,
13
- "grad_norm": 0.5307241678237915,
14
  "learning_rate": 3e-05,
15
- "loss": 2.0176,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.18361257746155613,
20
- "grad_norm": 0.6362708210945129,
21
  "learning_rate": 3e-05,
22
- "loss": 1.7953,
23
  "step": 200
24
  },
25
  {
26
- "epoch": 0.2754188661923342,
27
- "grad_norm": 0.3823029696941376,
28
  "learning_rate": 3e-05,
29
- "loss": 1.7849,
30
  "step": 300
31
  },
32
  {
33
- "epoch": 0.36722515492311225,
34
- "grad_norm": 0.43036168813705444,
35
  "learning_rate": 3e-05,
36
- "loss": 1.7818,
37
  "step": 400
38
  },
39
  {
40
- "epoch": 0.4590314436538903,
41
- "grad_norm": 0.5922847390174866,
42
  "learning_rate": 3e-05,
43
- "loss": 1.7649,
44
  "step": 500
45
  },
46
  {
47
- "epoch": 0.5508377323846684,
48
- "grad_norm": 0.40470582246780396,
49
  "learning_rate": 3e-05,
50
- "loss": 1.7902,
51
  "step": 600
52
  },
53
  {
54
- "epoch": 0.6426440211154464,
55
- "grad_norm": 0.49739205837249756,
56
  "learning_rate": 3e-05,
57
- "loss": 1.7606,
58
  "step": 700
59
  },
60
  {
61
- "epoch": 0.7344503098462245,
62
- "grad_norm": 0.43984296917915344,
63
  "learning_rate": 3e-05,
64
- "loss": 1.7746,
65
  "step": 800
66
  },
67
  {
68
- "epoch": 0.8262565985770025,
69
- "grad_norm": 0.4566909968852997,
70
  "learning_rate": 3e-05,
71
- "loss": 1.7586,
72
  "step": 900
73
  },
74
  {
75
- "epoch": 0.9180628873077806,
76
- "grad_norm": 0.4243898093700409,
77
  "learning_rate": 3e-05,
78
- "loss": 1.7775,
79
  "step": 1000
80
  },
81
  {
82
- "epoch": 0.999770484278173,
83
- "eval_accuracy": 0.5157460317460317,
84
- "eval_loss": 2.3358356952667236,
85
- "eval_runtime": 5.2798,
86
- "eval_samples_per_second": 94.7,
87
- "eval_steps_per_second": 11.932,
88
  "step": 1089
89
  },
90
  {
91
- "epoch": 0.999770484278173,
92
- "eval_exact_match": 19.0,
93
- "eval_f1": 26.812075147075163,
94
  "step": 1089
95
  },
96
  {
97
- "epoch": 1.0098691760385587,
98
- "grad_norm": 0.47265738248825073,
99
  "learning_rate": 3e-05,
100
- "loss": 1.7676,
101
  "step": 1100
102
  },
103
  {
104
- "epoch": 1.1016754647693368,
105
- "grad_norm": 0.6015958189964294,
106
  "learning_rate": 3e-05,
107
- "loss": 1.7397,
108
  "step": 1200
109
  },
110
  {
111
- "epoch": 1.1934817535001148,
112
- "grad_norm": 0.5499740242958069,
113
  "learning_rate": 3e-05,
114
- "loss": 1.7254,
115
  "step": 1300
116
  },
117
  {
118
- "epoch": 1.2852880422308928,
119
- "grad_norm": 0.5272372961044312,
120
  "learning_rate": 3e-05,
121
- "loss": 1.73,
122
  "step": 1400
123
  },
124
  {
125
- "epoch": 1.377094330961671,
126
- "grad_norm": 0.5437406897544861,
127
  "learning_rate": 3e-05,
128
- "loss": 1.7292,
129
  "step": 1500
130
  },
131
  {
132
- "epoch": 1.468900619692449,
133
- "grad_norm": 0.5464481115341187,
134
  "learning_rate": 3e-05,
135
- "loss": 1.728,
136
  "step": 1600
137
  },
138
  {
139
- "epoch": 1.560706908423227,
140
- "grad_norm": 0.6755242347717285,
141
  "learning_rate": 3e-05,
142
- "loss": 1.7407,
143
  "step": 1700
144
  },
145
  {
146
- "epoch": 1.652513197154005,
147
- "grad_norm": 0.5893722176551819,
148
  "learning_rate": 3e-05,
149
- "loss": 1.7244,
150
  "step": 1800
151
  },
152
  {
153
- "epoch": 1.744319485884783,
154
- "grad_norm": 0.5930923223495483,
155
  "learning_rate": 3e-05,
156
- "loss": 1.7225,
157
  "step": 1900
158
  },
159
  {
160
- "epoch": 1.836125774615561,
161
- "grad_norm": 0.5917518734931946,
162
  "learning_rate": 3e-05,
163
- "loss": 1.7189,
164
  "step": 2000
165
  },
166
  {
167
- "epoch": 1.9279320633463393,
168
- "grad_norm": 0.6680920124053955,
169
  "learning_rate": 3e-05,
170
- "loss": 1.7313,
171
  "step": 2100
172
  },
173
  {
174
- "epoch": 1.999540968556346,
175
- "eval_accuracy": 0.5178730158730158,
176
- "eval_loss": 2.3068432807922363,
177
- "eval_runtime": 5.4629,
178
- "eval_samples_per_second": 91.527,
179
- "eval_steps_per_second": 11.532,
180
  "step": 2178
181
  },
182
  {
183
- "epoch": 1.999540968556346,
184
- "eval_exact_match": 17.8,
185
- "eval_f1": 26.628214855320138,
186
  "step": 2178
187
  },
188
  {
189
- "epoch": 2.0197383520771175,
190
- "grad_norm": 0.6736780405044556,
191
  "learning_rate": 3e-05,
192
- "loss": 1.7046,
193
  "step": 2200
194
  },
195
  {
196
- "epoch": 2.1115446408078955,
197
- "grad_norm": 0.7276094555854797,
198
  "learning_rate": 3e-05,
199
- "loss": 1.6607,
200
  "step": 2300
201
  },
202
  {
203
- "epoch": 2.2033509295386735,
204
- "grad_norm": 0.7279476523399353,
205
  "learning_rate": 3e-05,
206
- "loss": 1.6791,
207
  "step": 2400
208
  },
209
  {
210
- "epoch": 2.2951572182694515,
211
- "grad_norm": 0.7652662992477417,
212
  "learning_rate": 3e-05,
213
- "loss": 1.6709,
214
  "step": 2500
215
  },
216
  {
217
- "epoch": 2.3869635070002295,
218
- "grad_norm": 0.6291841268539429,
219
  "learning_rate": 3e-05,
220
- "loss": 1.6659,
221
  "step": 2600
222
  },
223
  {
224
- "epoch": 2.4787697957310075,
225
- "grad_norm": 0.7375943660736084,
226
  "learning_rate": 3e-05,
227
- "loss": 1.67,
228
  "step": 2700
229
  },
230
  {
231
- "epoch": 2.5705760844617855,
232
- "grad_norm": 0.7475445866584778,
233
  "learning_rate": 3e-05,
234
- "loss": 1.6769,
235
  "step": 2800
236
  },
237
  {
238
- "epoch": 2.6623823731925635,
239
- "grad_norm": 5.488095283508301,
240
  "learning_rate": 3e-05,
241
- "loss": 1.6726,
242
  "step": 2900
243
  },
244
  {
245
- "epoch": 2.754188661923342,
246
- "grad_norm": 11046773.0,
247
  "learning_rate": 3e-05,
248
- "loss": 2.4041,
249
  "step": 3000
250
  },
251
  {
252
- "epoch": 2.84599495065412,
253
- "grad_norm": 9348.0419921875,
254
  "learning_rate": 3e-05,
255
- "loss": 3.254,
256
  "step": 3100
257
  },
258
  {
259
- "epoch": 2.937801239384898,
260
- "grad_norm": 42427.7578125,
261
  "learning_rate": 3e-05,
262
- "loss": 3.648,
263
  "step": 3200
264
  },
265
  {
266
- "epoch": 2.9993114528345193,
267
- "eval_accuracy": 0.4067936507936508,
268
- "eval_loss": 4.151577949523926,
269
- "eval_runtime": 5.4271,
270
- "eval_samples_per_second": 92.13,
271
- "eval_steps_per_second": 11.608,
272
  "step": 3267
273
  },
274
  {
275
- "epoch": 2.9993114528345193,
276
- "eval_exact_match": 0.0,
277
- "eval_f1": 1.3877253548324662,
278
  "step": 3267
279
  },
280
  {
281
- "epoch": 3.029607528115676,
282
- "grad_norm": 69864.046875,
283
  "learning_rate": 3e-05,
284
- "loss": 3.6794,
285
  "step": 3300
286
  },
287
  {
288
- "epoch": 3.121413816846454,
289
- "grad_norm": 763.395263671875,
290
  "learning_rate": 3e-05,
291
- "loss": 3.6341,
292
  "step": 3400
293
  },
294
  {
295
- "epoch": 3.213220105577232,
296
- "grad_norm": 297052.0625,
297
  "learning_rate": 3e-05,
298
- "loss": 6.3864,
299
  "step": 3500
300
  },
301
  {
302
- "epoch": 3.30502639430801,
303
- "grad_norm": 327590.21875,
304
  "learning_rate": 3e-05,
305
- "loss": 8.1688,
306
  "step": 3600
307
  },
308
  {
309
- "epoch": 3.396832683038788,
310
- "grad_norm": 35.1007080078125,
311
  "learning_rate": 3e-05,
312
- "loss": 7.4099,
313
  "step": 3700
314
  },
315
  {
316
- "epoch": 3.488638971769566,
317
- "grad_norm": 709.9918823242188,
318
  "learning_rate": 3e-05,
319
- "loss": 3.7034,
320
  "step": 3800
321
  },
322
  {
323
- "epoch": 3.580445260500344,
324
- "grad_norm": 5584611.5,
325
  "learning_rate": 3e-05,
326
- "loss": 3.832,
327
  "step": 3900
328
  },
329
  {
330
- "epoch": 3.672251549231122,
331
- "grad_norm": 684474.4375,
332
  "learning_rate": 3e-05,
333
- "loss": 4.8077,
334
  "step": 4000
335
  },
336
  {
337
- "epoch": 3.7640578379619005,
338
- "grad_norm": 137445040.0,
339
  "learning_rate": 3e-05,
340
- "loss": 7.6647,
341
  "step": 4100
342
  },
343
  {
344
- "epoch": 3.8558641266926785,
345
- "grad_norm": 15619359.0,
346
  "learning_rate": 3e-05,
347
- "loss": 9.0532,
348
  "step": 4200
349
  },
350
  {
351
- "epoch": 3.9476704154234565,
352
- "grad_norm": 252526384.0,
353
  "learning_rate": 3e-05,
354
- "loss": 9.0562,
355
  "step": 4300
356
  },
357
  {
358
  "epoch": 4.0,
359
- "eval_accuracy": 0.26304761904761903,
360
- "eval_loss": 9.931685447692871,
361
- "eval_runtime": 5.1725,
362
- "eval_samples_per_second": 96.665,
363
- "eval_steps_per_second": 12.18,
364
  "step": 4357
365
  },
366
  {
367
  "epoch": 4.0,
368
- "eval_exact_match": 0.0,
369
- "eval_f1": 0.1,
370
  "step": 4357
371
  },
372
  {
373
- "epoch": 4.039476704154235,
374
- "grad_norm": 96136593408.0,
375
  "learning_rate": 3e-05,
376
- "loss": 9.1763,
377
  "step": 4400
378
  },
379
  {
380
- "epoch": 4.131282992885013,
381
- "grad_norm": 1025443364864.0,
382
  "learning_rate": 3e-05,
383
- "loss": 9.1246,
384
  "step": 4500
385
  },
386
  {
387
- "epoch": 4.223089281615791,
388
- "grad_norm": 265109962752.0,
389
  "learning_rate": 3e-05,
390
- "loss": 9.1531,
391
  "step": 4600
392
  },
393
  {
394
- "epoch": 4.314895570346569,
395
- "grad_norm": 3019844.25,
396
  "learning_rate": 3e-05,
397
- "loss": 9.7419,
398
  "step": 4700
399
  },
400
  {
401
- "epoch": 4.406701859077347,
402
- "grad_norm": 118806648.0,
403
  "learning_rate": 3e-05,
404
- "loss": 10.3409,
405
  "step": 4800
406
  },
407
  {
408
- "epoch": 4.498508147808125,
409
- "grad_norm": 14.5001859664917,
410
  "learning_rate": 3e-05,
411
- "loss": 9.5802,
412
  "step": 4900
413
  },
414
  {
415
- "epoch": 4.590314436538903,
416
- "grad_norm": 11.577363014221191,
417
  "learning_rate": 3e-05,
418
- "loss": 6.5391,
419
  "step": 5000
420
  },
421
  {
422
- "epoch": 4.682120725269681,
423
- "grad_norm": 6.0267863273620605,
424
  "learning_rate": 3e-05,
425
- "loss": 4.3604,
426
  "step": 5100
427
  },
428
  {
429
- "epoch": 4.773927014000459,
430
- "grad_norm": 99.47637176513672,
431
  "learning_rate": 3e-05,
432
- "loss": 2.5788,
433
  "step": 5200
434
  },
435
  {
436
- "epoch": 4.865733302731237,
437
- "grad_norm": 4.200494766235352,
438
  "learning_rate": 3e-05,
439
- "loss": 2.3746,
440
  "step": 5300
441
  },
442
  {
443
- "epoch": 4.957539591462015,
444
- "grad_norm": 43.855430603027344,
445
  "learning_rate": 3e-05,
446
- "loss": 2.4535,
447
  "step": 5400
448
  },
449
  {
450
- "epoch": 4.999770484278173,
451
- "eval_accuracy": 0.363015873015873,
452
- "eval_loss": 4.993338108062744,
453
- "eval_runtime": 5.5548,
454
- "eval_samples_per_second": 90.012,
455
- "eval_steps_per_second": 11.342,
456
  "step": 5446
457
  },
458
  {
459
- "epoch": 4.999770484278173,
460
- "eval_exact_match": 0.0,
461
- "eval_f1": 0.8575555035818295,
462
  "step": 5446
463
  },
464
  {
465
- "epoch": 5.049345880192793,
466
- "grad_norm": 751.8612060546875,
467
  "learning_rate": 3e-05,
468
- "loss": 2.6406,
469
  "step": 5500
470
  },
471
  {
472
- "epoch": 5.141152168923571,
473
- "grad_norm": 40140.3828125,
474
  "learning_rate": 3e-05,
475
- "loss": 3.7211,
476
  "step": 5600
477
  },
478
  {
479
- "epoch": 5.232958457654349,
480
- "grad_norm": 468036.75,
481
  "learning_rate": 3e-05,
482
- "loss": 6.2797,
483
  "step": 5700
484
  },
485
  {
486
- "epoch": 5.324764746385127,
487
- "grad_norm": 51365.5390625,
488
  "learning_rate": 3e-05,
489
- "loss": 7.3688,
490
  "step": 5800
491
  },
492
  {
493
- "epoch": 5.416571035115905,
494
- "grad_norm": 4080417.5,
495
  "learning_rate": 3e-05,
496
- "loss": 7.3855,
497
  "step": 5900
498
  },
499
  {
500
- "epoch": 5.508377323846684,
501
- "grad_norm": 1536171.625,
502
  "learning_rate": 3e-05,
503
- "loss": 7.5376,
504
  "step": 6000
505
  },
506
  {
507
- "epoch": 5.600183612577462,
508
- "grad_norm": 42693244.0,
509
  "learning_rate": 3e-05,
510
- "loss": 7.7232,
511
  "step": 6100
512
  },
513
  {
514
- "epoch": 5.69198990130824,
515
- "grad_norm": 776647.25,
516
  "learning_rate": 3e-05,
517
- "loss": 8.3079,
518
  "step": 6200
519
  },
520
  {
521
- "epoch": 5.783796190039018,
522
- "grad_norm": 153013.40625,
523
  "learning_rate": 3e-05,
524
- "loss": 8.3088,
525
  "step": 6300
526
  },
527
  {
528
- "epoch": 5.875602478769796,
529
- "grad_norm": 94984.4140625,
530
  "learning_rate": 3e-05,
531
- "loss": 8.1181,
532
  "step": 6400
533
  },
534
  {
535
- "epoch": 5.967408767500574,
536
- "grad_norm": 33103526.0,
537
  "learning_rate": 3e-05,
538
- "loss": 8.1527,
539
  "step": 6500
540
  },
541
  {
542
- "epoch": 5.999540968556346,
543
- "eval_accuracy": 0.25971428571428573,
544
- "eval_loss": 9.265142440795898,
545
- "eval_runtime": 6.318,
546
- "eval_samples_per_second": 79.139,
547
- "eval_steps_per_second": 9.971,
548
  "step": 6535
549
  },
550
  {
551
- "epoch": 5.999540968556346,
552
- "eval_exact_match": 0.0,
553
- "eval_f1": 0.0,
554
  "step": 6535
555
  },
556
  {
557
- "epoch": 6.059215056231352,
558
- "grad_norm": 33229708.0,
559
  "learning_rate": 3e-05,
560
- "loss": 8.2663,
561
  "step": 6600
562
  },
563
  {
564
- "epoch": 6.15102134496213,
565
- "grad_norm": 769951168.0,
566
  "learning_rate": 3e-05,
567
- "loss": 8.156,
568
  "step": 6700
569
  },
570
  {
571
- "epoch": 6.242827633692908,
572
- "grad_norm": 2425211904.0,
573
  "learning_rate": 3e-05,
574
- "loss": 8.1438,
575
  "step": 6800
576
  },
577
  {
578
- "epoch": 6.334633922423686,
579
- "grad_norm": 5989025710080.0,
580
  "learning_rate": 3e-05,
581
- "loss": 8.0825,
582
  "step": 6900
583
  },
584
  {
585
- "epoch": 6.426440211154464,
586
- "grad_norm": 175022960.0,
587
  "learning_rate": 3e-05,
588
- "loss": 8.0587,
589
  "step": 7000
590
  },
591
  {
592
- "epoch": 6.518246499885242,
593
- "grad_norm": 6010.58642578125,
594
  "learning_rate": 3e-05,
595
- "loss": 7.9277,
596
  "step": 7100
597
  },
598
  {
599
- "epoch": 6.61005278861602,
600
- "grad_norm": 1003.8111572265625,
601
  "learning_rate": 3e-05,
602
- "loss": 6.9742,
603
  "step": 7200
604
  },
605
  {
606
- "epoch": 6.701859077346798,
607
- "grad_norm": 4030.06787109375,
608
  "learning_rate": 3e-05,
609
- "loss": 7.0699,
610
  "step": 7300
611
  },
612
  {
613
- "epoch": 6.793665366077576,
614
- "grad_norm": 59434.1640625,
615
  "learning_rate": 3e-05,
616
- "loss": 7.3625,
617
  "step": 7400
618
  },
619
  {
620
- "epoch": 6.885471654808354,
621
- "grad_norm": 66880.4609375,
622
  "learning_rate": 3e-05,
623
- "loss": 7.4483,
624
  "step": 7500
625
  },
626
  {
627
- "epoch": 6.977277943539132,
628
- "grad_norm": 4376.875,
629
  "learning_rate": 3e-05,
630
- "loss": 7.3556,
631
  "step": 7600
632
  },
633
  {
634
- "epoch": 6.999311452834519,
635
- "eval_accuracy": 0.2757777777777778,
636
- "eval_loss": 7.365324974060059,
637
- "eval_runtime": 5.3947,
638
- "eval_samples_per_second": 92.684,
639
- "eval_steps_per_second": 11.678,
640
  "step": 7624
641
  },
642
  {
643
- "epoch": 6.999311452834519,
644
- "eval_exact_match": 0.0,
645
- "eval_f1": 0.0,
646
  "step": 7624
647
  },
648
  {
649
- "epoch": 7.06908423226991,
650
- "grad_norm": 1153324.75,
651
  "learning_rate": 3e-05,
652
- "loss": 7.3696,
653
  "step": 7700
654
  },
655
  {
656
- "epoch": 7.160890521000688,
657
- "grad_norm": 184950352.0,
658
  "learning_rate": 3e-05,
659
- "loss": 7.4402,
660
  "step": 7800
661
  },
662
  {
663
- "epoch": 7.252696809731467,
664
- "grad_norm": 153754730496.0,
665
  "learning_rate": 3e-05,
666
- "loss": 7.8353,
667
  "step": 7900
668
  },
669
  {
670
- "epoch": 7.344503098462245,
671
- "grad_norm": 1028478208.0,
672
  "learning_rate": 3e-05,
673
- "loss": 7.7107,
674
  "step": 8000
675
  },
676
  {
677
- "epoch": 7.436309387193023,
678
- "grad_norm": 168327040.0,
679
  "learning_rate": 3e-05,
680
- "loss": 7.8299,
681
  "step": 8100
682
  },
683
  {
684
- "epoch": 7.528115675923801,
685
- "grad_norm": 1722198784.0,
686
  "learning_rate": 3e-05,
687
- "loss": 7.5231,
688
  "step": 8200
689
  },
690
  {
691
- "epoch": 7.619921964654579,
692
- "grad_norm": 159030912.0,
693
  "learning_rate": 3e-05,
694
- "loss": 7.7563,
695
  "step": 8300
696
  },
697
  {
698
- "epoch": 7.711728253385357,
699
- "grad_norm": 196888912.0,
700
  "learning_rate": 3e-05,
701
- "loss": 7.5294,
702
  "step": 8400
703
  },
704
  {
705
- "epoch": 7.803534542116135,
706
- "grad_norm": 6131166720.0,
707
  "learning_rate": 3e-05,
708
- "loss": 7.6518,
709
  "step": 8500
710
  },
711
  {
712
- "epoch": 7.895340830846913,
713
- "grad_norm": 2235290880.0,
714
  "learning_rate": 3e-05,
715
- "loss": 7.6978,
716
  "step": 8600
717
  },
718
  {
719
- "epoch": 7.987147119577691,
720
- "grad_norm": 477273.15625,
721
  "learning_rate": 3e-05,
722
- "loss": 8.1458,
723
  "step": 8700
724
  },
725
  {
726
  "epoch": 8.0,
727
- "eval_accuracy": 0.26526984126984127,
728
- "eval_loss": 7.865727424621582,
729
- "eval_runtime": 5.9295,
730
- "eval_samples_per_second": 84.323,
731
- "eval_steps_per_second": 10.625,
732
  "step": 8714
733
  },
734
  {
735
  "epoch": 8.0,
736
- "eval_exact_match": 0.0,
737
- "eval_f1": 0.0,
738
  "step": 8714
739
  },
740
  {
741
- "epoch": 8.07895340830847,
742
- "grad_norm": 124264720.0,
743
  "learning_rate": 3e-05,
744
- "loss": 8.1452,
745
  "step": 8800
746
  },
747
  {
748
- "epoch": 8.170759697039248,
749
- "grad_norm": 5402585.5,
750
  "learning_rate": 3e-05,
751
- "loss": 8.2439,
752
  "step": 8900
753
  },
754
  {
755
- "epoch": 8.262565985770026,
756
- "grad_norm": 128002850816.0,
757
  "learning_rate": 3e-05,
758
- "loss": 8.2394,
759
  "step": 9000
760
  },
761
  {
762
- "epoch": 8.354372274500804,
763
- "grad_norm": 2158333184.0,
764
  "learning_rate": 3e-05,
765
- "loss": 8.2597,
766
  "step": 9100
767
  },
768
  {
769
- "epoch": 8.446178563231582,
770
- "grad_norm": 138844256.0,
771
  "learning_rate": 3e-05,
772
- "loss": 8.1603,
773
  "step": 9200
774
  },
775
  {
776
- "epoch": 8.53798485196236,
777
- "grad_norm": 1801925.875,
778
  "learning_rate": 3e-05,
779
- "loss": 7.8055,
780
  "step": 9300
781
  },
782
  {
783
- "epoch": 8.629791140693138,
784
- "grad_norm": 2586681856.0,
785
  "learning_rate": 3e-05,
786
- "loss": 7.8,
787
  "step": 9400
788
  },
789
  {
790
- "epoch": 8.721597429423916,
791
- "grad_norm": 164454704.0,
792
  "learning_rate": 3e-05,
793
- "loss": 7.7856,
794
  "step": 9500
795
  },
796
  {
797
- "epoch": 8.813403718154694,
798
- "grad_norm": 1604157.5,
799
  "learning_rate": 3e-05,
800
- "loss": 7.9458,
801
  "step": 9600
802
  },
803
  {
804
- "epoch": 8.905210006885472,
805
- "grad_norm": 845944.5,
806
  "learning_rate": 3e-05,
807
- "loss": 7.7714,
808
  "step": 9700
809
  },
810
  {
811
- "epoch": 8.99701629561625,
812
- "grad_norm": 123345.3125,
813
  "learning_rate": 3e-05,
814
- "loss": 7.7142,
815
  "step": 9800
816
  },
817
  {
818
- "epoch": 8.999770484278173,
819
- "eval_accuracy": 0.28819047619047616,
820
- "eval_loss": 6.984739303588867,
821
- "eval_runtime": 5.2286,
822
- "eval_samples_per_second": 95.628,
823
- "eval_steps_per_second": 12.049,
824
  "step": 9803
825
  },
826
  {
827
- "epoch": 8.999770484278173,
828
- "eval_exact_match": 0.0,
829
- "eval_f1": 0.05,
830
  "step": 9803
831
  },
832
  {
833
- "epoch": 9.088822584347028,
834
- "grad_norm": 4543730.5,
835
  "learning_rate": 3e-05,
836
- "loss": 7.5003,
837
  "step": 9900
838
  },
839
  {
840
- "epoch": 9.180628873077806,
841
- "grad_norm": 204568.890625,
842
  "learning_rate": 3e-05,
843
- "loss": 7.6354,
844
  "step": 10000
845
  },
846
  {
847
- "epoch": 9.272435161808584,
848
- "grad_norm": 10227080.0,
849
  "learning_rate": 3e-05,
850
- "loss": 7.6448,
851
  "step": 10100
852
  },
853
  {
854
- "epoch": 9.364241450539362,
855
- "grad_norm": 20528.154296875,
856
  "learning_rate": 3e-05,
857
- "loss": 7.8043,
858
  "step": 10200
859
  },
860
  {
861
- "epoch": 9.45604773927014,
862
- "grad_norm": 33088.14453125,
863
  "learning_rate": 3e-05,
864
- "loss": 7.7847,
865
  "step": 10300
866
  },
867
  {
868
- "epoch": 9.547854028000918,
869
- "grad_norm": 24437.92578125,
870
  "learning_rate": 3e-05,
871
- "loss": 7.6198,
872
  "step": 10400
873
  },
874
  {
875
- "epoch": 9.639660316731696,
876
- "grad_norm": 3420.399658203125,
877
  "learning_rate": 3e-05,
878
- "loss": 7.2853,
879
  "step": 10500
880
  },
881
  {
882
- "epoch": 9.731466605462474,
883
- "grad_norm": 405.9403381347656,
884
  "learning_rate": 3e-05,
885
- "loss": 7.0699,
886
  "step": 10600
887
  },
888
  {
889
- "epoch": 9.823272894193252,
890
- "grad_norm": 3901.121337890625,
891
  "learning_rate": 3e-05,
892
- "loss": 7.0934,
893
  "step": 10700
894
  },
895
  {
896
- "epoch": 9.91507918292403,
897
- "grad_norm": 1741.5684814453125,
898
  "learning_rate": 3e-05,
899
- "loss": 7.2638,
900
  "step": 10800
901
  },
902
  {
903
- "epoch": 9.999540968556346,
904
- "eval_accuracy": 0.29244444444444445,
905
- "eval_loss": 7.111032009124756,
906
- "eval_runtime": 5.2503,
907
- "eval_samples_per_second": 95.233,
908
- "eval_steps_per_second": 11.999,
909
  "step": 10892
910
  },
911
  {
912
- "epoch": 9.999540968556346,
913
  "eval_exact_match": 0.0,
914
- "eval_f1": 0.0,
915
  "step": 10892
916
  },
917
  {
918
- "epoch": 10.006885471654808,
919
- "grad_norm": 9326.337890625,
920
  "learning_rate": 3e-05,
921
- "loss": 7.1126,
922
  "step": 10900
923
  },
924
  {
925
- "epoch": 10.098691760385586,
926
- "grad_norm": 12598.7568359375,
927
  "learning_rate": 3e-05,
928
- "loss": 7.2214,
929
  "step": 11000
930
  },
931
  {
932
- "epoch": 10.190498049116364,
933
- "grad_norm": 6.796374797821045,
934
  "learning_rate": 3e-05,
935
- "loss": 6.8649,
936
  "step": 11100
937
  },
938
  {
939
- "epoch": 10.282304337847142,
940
- "grad_norm": 29789.6171875,
941
  "learning_rate": 3e-05,
942
- "loss": 4.532,
943
  "step": 11200
944
  },
945
  {
946
- "epoch": 10.37411062657792,
947
- "grad_norm": 213.58909606933594,
948
  "learning_rate": 3e-05,
949
- "loss": 3.7364,
950
  "step": 11300
951
  },
952
  {
953
- "epoch": 10.465916915308698,
954
- "grad_norm": 1166033.5,
955
  "learning_rate": 3e-05,
956
- "loss": 3.2517,
957
  "step": 11400
958
  },
959
  {
960
- "epoch": 10.557723204039476,
961
- "grad_norm": 3554672.25,
962
  "learning_rate": 3e-05,
963
- "loss": 3.4791,
964
  "step": 11500
965
  },
966
  {
967
- "epoch": 10.649529492770254,
968
- "grad_norm": 10.04858684539795,
969
  "learning_rate": 3e-05,
970
- "loss": 4.8491,
971
  "step": 11600
972
  },
973
  {
974
- "epoch": 10.741335781501032,
975
- "grad_norm": 860045.8125,
976
  "learning_rate": 3e-05,
977
- "loss": 4.2615,
978
  "step": 11700
979
  },
980
  {
981
- "epoch": 10.83314207023181,
982
- "grad_norm": 3906973.0,
983
  "learning_rate": 3e-05,
984
- "loss": 6.7154,
985
  "step": 11800
986
  },
987
  {
988
- "epoch": 10.924948358962588,
989
- "grad_norm": 4682.72265625,
990
  "learning_rate": 3e-05,
991
- "loss": 6.1172,
992
  "step": 11900
993
  },
994
  {
995
- "epoch": 10.999311452834519,
996
- "eval_accuracy": 0.25682539682539685,
997
- "eval_loss": 7.6613993644714355,
998
- "eval_runtime": 5.538,
999
- "eval_samples_per_second": 90.285,
1000
- "eval_steps_per_second": 11.376,
1001
  "step": 11981
1002
  },
1003
  {
1004
- "epoch": 10.999311452834519,
1005
  "eval_exact_match": 0.0,
1006
- "eval_f1": 0.02384313725490196,
1007
  "step": 11981
1008
  },
1009
  {
1010
- "epoch": 11.016754647693366,
1011
- "grad_norm": 1283400.375,
1012
  "learning_rate": 3e-05,
1013
- "loss": 6.0828,
1014
  "step": 12000
1015
  },
1016
  {
1017
- "epoch": 11.108560936424144,
1018
- "grad_norm": 26411734.0,
1019
  "learning_rate": 3e-05,
1020
- "loss": 6.5592,
1021
  "step": 12100
1022
  },
1023
  {
1024
- "epoch": 11.200367225154924,
1025
- "grad_norm": 16718.0234375,
1026
  "learning_rate": 3e-05,
1027
- "loss": 6.6523,
1028
  "step": 12200
1029
  },
1030
  {
1031
- "epoch": 11.292173513885702,
1032
- "grad_norm": 848.5704956054688,
1033
  "learning_rate": 3e-05,
1034
- "loss": 7.4346,
1035
  "step": 12300
1036
  },
1037
  {
1038
- "epoch": 11.38397980261648,
1039
- "grad_norm": 156.42295837402344,
1040
  "learning_rate": 3e-05,
1041
- "loss": 7.0466,
1042
  "step": 12400
1043
  },
1044
  {
1045
- "epoch": 11.475786091347258,
1046
- "grad_norm": 24240020.0,
1047
  "learning_rate": 3e-05,
1048
- "loss": 8.1477,
1049
  "step": 12500
1050
  },
1051
  {
1052
- "epoch": 11.567592380078036,
1053
- "grad_norm": 60810556.0,
1054
  "learning_rate": 3e-05,
1055
- "loss": 11.7305,
1056
  "step": 12600
1057
  },
1058
  {
1059
- "epoch": 11.659398668808814,
1060
- "grad_norm": 4.380503717183488e+16,
1061
  "learning_rate": 3e-05,
1062
- "loss": 11.6804,
1063
  "step": 12700
1064
  },
1065
  {
1066
- "epoch": 11.751204957539592,
1067
- "grad_norm": 79782666567680.0,
1068
  "learning_rate": 3e-05,
1069
- "loss": 9.5319,
1070
  "step": 12800
1071
  },
1072
  {
1073
- "epoch": 11.84301124627037,
1074
- "grad_norm": 100421464.0,
1075
  "learning_rate": 3e-05,
1076
- "loss": 9.3466,
1077
  "step": 12900
1078
  },
1079
  {
1080
- "epoch": 11.934817535001148,
1081
- "grad_norm": 335482.3125,
1082
  "learning_rate": 3e-05,
1083
- "loss": 9.2178,
1084
  "step": 13000
1085
  },
1086
  {
1087
  "epoch": 12.0,
1088
- "eval_accuracy": 0.2553968253968254,
1089
- "eval_loss": 9.910024642944336,
1090
- "eval_runtime": 5.5219,
1091
- "eval_samples_per_second": 90.549,
1092
- "eval_steps_per_second": 11.409,
1093
  "step": 13071
1094
  },
1095
  {
1096
  "epoch": 12.0,
1097
  "eval_exact_match": 0.0,
1098
- "eval_f1": 0.0,
1099
  "step": 13071
1100
  },
1101
  {
1102
- "epoch": 12.026623823731926,
1103
- "grad_norm": 34611664.0,
1104
  "learning_rate": 3e-05,
1105
- "loss": 7.8897,
1106
  "step": 13100
1107
  },
1108
  {
1109
- "epoch": 12.118430112462704,
1110
- "grad_norm": 136696896.0,
1111
  "learning_rate": 3e-05,
1112
- "loss": 8.4494,
1113
  "step": 13200
1114
  },
1115
  {
1116
- "epoch": 12.210236401193482,
1117
- "grad_norm": 7467604992.0,
1118
  "learning_rate": 3e-05,
1119
- "loss": 9.0639,
1120
  "step": 13300
1121
  },
1122
  {
1123
- "epoch": 12.30204268992426,
1124
- "grad_norm": 903450752.0,
1125
  "learning_rate": 3e-05,
1126
- "loss": 8.3847,
1127
  "step": 13400
1128
  },
1129
  {
1130
- "epoch": 12.393848978655038,
1131
- "grad_norm": 389607488.0,
1132
  "learning_rate": 3e-05,
1133
- "loss": 8.4695,
1134
  "step": 13500
1135
  },
1136
  {
1137
- "epoch": 12.485655267385816,
1138
- "grad_norm": 7033111040.0,
1139
  "learning_rate": 3e-05,
1140
- "loss": 8.252,
1141
  "step": 13600
1142
  },
1143
  {
1144
- "epoch": 12.577461556116594,
1145
- "grad_norm": 24849942.0,
1146
  "learning_rate": 3e-05,
1147
- "loss": 7.8665,
1148
  "step": 13700
1149
  },
1150
  {
1151
- "epoch": 12.669267844847372,
1152
- "grad_norm": 20334586.0,
1153
  "learning_rate": 3e-05,
1154
- "loss": 7.6115,
1155
  "step": 13800
1156
  },
1157
  {
1158
- "epoch": 12.76107413357815,
1159
- "grad_norm": 1128190848.0,
1160
  "learning_rate": 3e-05,
1161
- "loss": 7.8072,
1162
  "step": 13900
1163
  },
1164
  {
1165
- "epoch": 12.852880422308928,
1166
- "grad_norm": 26534666240.0,
1167
  "learning_rate": 3e-05,
1168
- "loss": 9.5496,
1169
  "step": 14000
1170
  },
1171
  {
1172
- "epoch": 12.944686711039706,
1173
- "grad_norm": 1785278496768.0,
1174
  "learning_rate": 3e-05,
1175
- "loss": 9.6825,
1176
  "step": 14100
1177
  },
1178
  {
1179
- "epoch": 12.999770484278173,
1180
- "eval_accuracy": 0.23657142857142857,
1181
- "eval_loss": 17.21571922302246,
1182
- "eval_runtime": 5.2648,
1183
- "eval_samples_per_second": 94.97,
1184
- "eval_steps_per_second": 11.966,
1185
  "step": 14160
1186
  },
1187
  {
1188
- "epoch": 12.999770484278173,
1189
  "eval_exact_match": 0.0,
1190
- "eval_f1": 0.0,
1191
  "step": 14160
1192
  },
1193
  {
1194
- "epoch": 13.036492999770484,
1195
- "grad_norm": 2082756100096.0,
1196
  "learning_rate": 3e-05,
1197
- "loss": 9.7452,
1198
  "step": 14200
1199
  },
1200
  {
1201
- "epoch": 13.128299288501262,
1202
- "grad_norm": 22699044864.0,
1203
  "learning_rate": 3e-05,
1204
- "loss": 9.8461,
1205
  "step": 14300
1206
  },
1207
  {
1208
- "epoch": 13.22010557723204,
1209
- "grad_norm": 636028125184.0,
1210
  "learning_rate": 3e-05,
1211
- "loss": 9.6601,
1212
  "step": 14400
1213
  },
1214
  {
1215
- "epoch": 13.311911865962818,
1216
- "grad_norm": 124859277312.0,
1217
  "learning_rate": 3e-05,
1218
- "loss": 9.6887,
1219
  "step": 14500
1220
  },
1221
  {
1222
- "epoch": 13.403718154693596,
1223
- "grad_norm": 175380627456.0,
1224
  "learning_rate": 3e-05,
1225
- "loss": 9.69,
1226
  "step": 14600
1227
  },
1228
  {
1229
- "epoch": 13.495524443424374,
1230
- "grad_norm": 369773576192.0,
1231
  "learning_rate": 3e-05,
1232
- "loss": 9.6817,
1233
  "step": 14700
1234
  },
1235
  {
1236
- "epoch": 13.587330732155152,
1237
- "grad_norm": 24862167040.0,
1238
  "learning_rate": 3e-05,
1239
- "loss": 8.522,
1240
  "step": 14800
1241
  },
1242
  {
1243
- "epoch": 13.67913702088593,
1244
- "grad_norm": 50220883968.0,
1245
  "learning_rate": 3e-05,
1246
- "loss": 7.7386,
1247
  "step": 14900
1248
  },
1249
  {
1250
- "epoch": 13.770943309616708,
1251
- "grad_norm": 225417920512.0,
1252
  "learning_rate": 3e-05,
1253
- "loss": 11.1681,
1254
  "step": 15000
1255
  },
1256
  {
1257
- "epoch": 13.862749598347486,
1258
- "grad_norm": 91464466432.0,
1259
  "learning_rate": 3e-05,
1260
- "loss": 9.9399,
1261
  "step": 15100
1262
  },
1263
  {
1264
- "epoch": 13.954555887078264,
1265
- "grad_norm": 10590986240.0,
1266
  "learning_rate": 3e-05,
1267
- "loss": 7.4843,
1268
  "step": 15200
1269
  },
1270
  {
1271
- "epoch": 13.999540968556346,
1272
- "eval_accuracy": 0.26212698412698415,
1273
- "eval_loss": 7.1726508140563965,
1274
- "eval_runtime": 5.982,
1275
- "eval_samples_per_second": 83.583,
1276
- "eval_steps_per_second": 10.532,
1277
  "step": 15249
1278
  },
1279
  {
1280
- "epoch": 13.999540968556346,
1281
  "eval_exact_match": 0.0,
1282
- "eval_f1": 0.0,
1283
  "step": 15249
1284
  },
1285
  {
1286
- "epoch": 14.046362175809042,
1287
- "grad_norm": 1116.731689453125,
1288
  "learning_rate": 3e-05,
1289
- "loss": 7.3998,
1290
  "step": 15300
1291
  },
1292
  {
1293
- "epoch": 14.13816846453982,
1294
- "grad_norm": 1628687.875,
1295
  "learning_rate": 3e-05,
1296
- "loss": 6.9832,
1297
  "step": 15400
1298
  },
1299
  {
1300
- "epoch": 14.229974753270598,
1301
- "grad_norm": 1284.560791015625,
1302
  "learning_rate": 3e-05,
1303
- "loss": 7.0072,
1304
  "step": 15500
1305
  },
1306
  {
1307
- "epoch": 14.321781042001376,
1308
- "grad_norm": 866471198588928.0,
1309
  "learning_rate": 3e-05,
1310
- "loss": 8.5636,
1311
  "step": 15600
1312
  },
1313
  {
1314
- "epoch": 14.413587330732156,
1315
- "grad_norm": 4296672934887424.0,
1316
  "learning_rate": 3e-05,
1317
- "loss": 9.1031,
1318
  "step": 15700
1319
  },
1320
  {
1321
- "epoch": 14.505393619462934,
1322
- "grad_norm": 1.4875405555073024e+16,
1323
  "learning_rate": 3e-05,
1324
- "loss": 9.2137,
1325
  "step": 15800
1326
  },
1327
  {
1328
- "epoch": 14.597199908193712,
1329
- "grad_norm": 1388941279232.0,
1330
  "learning_rate": 3e-05,
1331
- "loss": 11.3093,
1332
  "step": 15900
1333
  },
1334
  {
1335
- "epoch": 14.68900619692449,
1336
- "grad_norm": 1409362034688.0,
1337
  "learning_rate": 3e-05,
1338
- "loss": 9.0121,
1339
  "step": 16000
1340
  },
1341
  {
1342
- "epoch": 14.780812485655268,
1343
- "grad_norm": 4890259968.0,
1344
  "learning_rate": 3e-05,
1345
- "loss": 8.646,
1346
  "step": 16100
1347
  },
1348
  {
1349
- "epoch": 14.872618774386046,
1350
- "grad_norm": 15250687524864.0,
1351
  "learning_rate": 3e-05,
1352
- "loss": 7.8009,
1353
  "step": 16200
1354
  },
1355
  {
1356
- "epoch": 14.964425063116824,
1357
- "grad_norm": 8203539968.0,
1358
  "learning_rate": 3e-05,
1359
- "loss": 7.8643,
1360
  "step": 16300
1361
  },
1362
  {
1363
- "epoch": 14.999311452834519,
1364
- "eval_accuracy": 0.24946031746031747,
1365
- "eval_loss": 8.278827667236328,
1366
- "eval_runtime": 5.6796,
1367
- "eval_samples_per_second": 88.034,
1368
- "eval_steps_per_second": 11.092,
1369
  "step": 16338
1370
  },
1371
  {
1372
- "epoch": 14.999311452834519,
1373
  "eval_exact_match": 0.0,
1374
- "eval_f1": 0.48018819821072645,
1375
  "step": 16338
1376
  },
1377
  {
1378
- "epoch": 15.056231351847602,
1379
- "grad_norm": 828310464.0,
1380
  "learning_rate": 3e-05,
1381
- "loss": 7.3876,
1382
  "step": 16400
1383
  },
1384
  {
1385
- "epoch": 15.14803764057838,
1386
- "grad_norm": 568067031040.0,
1387
  "learning_rate": 3e-05,
1388
- "loss": 7.5295,
1389
  "step": 16500
1390
  },
1391
  {
1392
- "epoch": 15.239843929309158,
1393
- "grad_norm": 1.0880810018144256e+16,
1394
  "learning_rate": 3e-05,
1395
- "loss": 8.5293,
1396
  "step": 16600
1397
  },
1398
  {
1399
- "epoch": 15.331650218039936,
1400
- "grad_norm": 32871582.0,
1401
  "learning_rate": 3e-05,
1402
- "loss": 8.0883,
1403
  "step": 16700
1404
  },
1405
  {
1406
- "epoch": 15.423456506770714,
1407
- "grad_norm": 6865388.0,
1408
  "learning_rate": 3e-05,
1409
- "loss": 7.6397,
1410
  "step": 16800
1411
  },
1412
  {
1413
- "epoch": 15.515262795501492,
1414
- "grad_norm": 49280404.0,
1415
  "learning_rate": 3e-05,
1416
- "loss": 7.755,
1417
  "step": 16900
1418
  },
1419
  {
1420
- "epoch": 15.60706908423227,
1421
- "grad_norm": 390155370496.0,
1422
  "learning_rate": 3e-05,
1423
- "loss": 7.3984,
1424
  "step": 17000
1425
  },
1426
  {
1427
- "epoch": 15.698875372963048,
1428
- "grad_norm": 290927424.0,
1429
  "learning_rate": 3e-05,
1430
- "loss": 9.2541,
1431
  "step": 17100
1432
  },
1433
  {
1434
- "epoch": 15.790681661693826,
1435
- "grad_norm": 6678533120.0,
1436
  "learning_rate": 3e-05,
1437
- "loss": 7.6938,
1438
  "step": 17200
1439
  },
1440
  {
1441
- "epoch": 15.882487950424604,
1442
- "grad_norm": 35520180.0,
1443
  "learning_rate": 3e-05,
1444
- "loss": 8.1473,
1445
  "step": 17300
1446
  },
1447
  {
1448
- "epoch": 15.974294239155382,
1449
- "grad_norm": 600111680.0,
1450
  "learning_rate": 3e-05,
1451
- "loss": 9.0448,
1452
  "step": 17400
1453
  },
1454
  {
1455
  "epoch": 16.0,
1456
- "eval_accuracy": 0.25244444444444447,
1457
- "eval_loss": 17.031343460083008,
1458
- "eval_runtime": 5.5083,
1459
- "eval_samples_per_second": 90.773,
1460
- "eval_steps_per_second": 11.437,
1461
  "step": 17428
1462
  },
1463
  {
1464
  "epoch": 16.0,
1465
  "eval_exact_match": 0.0,
1466
- "eval_f1": 0.0,
1467
  "step": 17428
1468
  }
1469
  ],
1470
  "logging_steps": 100,
1471
  "max_steps": 54450,
1472
- "num_input_tokens_seen": 0,
1473
  "num_train_epochs": 50,
1474
  "save_steps": 500,
1475
- "stateful_callbacks": {
1476
- "TrainerControl": {
1477
- "args": {
1478
- "should_epoch_stop": false,
1479
- "should_evaluate": false,
1480
- "should_log": false,
1481
- "should_save": true,
1482
- "should_training_stop": false
1483
- },
1484
- "attributes": {}
1485
- }
1486
- },
1487
- "total_flos": 1.2024281860260495e+18,
1488
- "train_batch_size": 1,
1489
  "trial_name": null,
1490
  "trial_params": null
1491
  }
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.09,
 
13
  "learning_rate": 3e-05,
14
+ "loss": 1.5477,
15
  "step": 100
16
  },
17
  {
18
+ "epoch": 0.18,
 
19
  "learning_rate": 3e-05,
20
+ "loss": 1.3248,
21
  "step": 200
22
  },
23
  {
24
+ "epoch": 0.28,
 
25
  "learning_rate": 3e-05,
26
+ "loss": 1.2394,
27
  "step": 300
28
  },
29
  {
30
+ "epoch": 0.37,
 
31
  "learning_rate": 3e-05,
32
+ "loss": 1.2549,
33
  "step": 400
34
  },
35
  {
36
+ "epoch": 0.46,
 
37
  "learning_rate": 3e-05,
38
+ "loss": 1.2163,
39
  "step": 500
40
  },
41
  {
42
+ "epoch": 0.55,
 
43
  "learning_rate": 3e-05,
44
+ "loss": 1.2202,
45
  "step": 600
46
  },
47
  {
48
+ "epoch": 0.64,
 
49
  "learning_rate": 3e-05,
50
+ "loss": 1.2099,
51
  "step": 700
52
  },
53
  {
54
+ "epoch": 0.73,
 
55
  "learning_rate": 3e-05,
56
+ "loss": 1.2213,
57
  "step": 800
58
  },
59
  {
60
+ "epoch": 0.83,
 
61
  "learning_rate": 3e-05,
62
+ "loss": 1.2203,
63
  "step": 900
64
  },
65
  {
66
+ "epoch": 0.92,
 
67
  "learning_rate": 3e-05,
68
+ "loss": 1.2166,
69
  "step": 1000
70
  },
71
  {
72
+ "epoch": 1.0,
73
+ "eval_accuracy": 0.5925822784810126,
74
+ "eval_loss": 1.858124017715454,
75
+ "eval_runtime": 4.7843,
76
+ "eval_samples_per_second": 104.509,
77
+ "eval_steps_per_second": 13.168,
78
  "step": 1089
79
  },
80
  {
81
+ "epoch": 1.0,
82
+ "eval_exact_match": 25.0,
83
+ "eval_f1": 33.90013632566266,
84
  "step": 1089
85
  },
86
  {
87
+ "epoch": 1.01,
 
88
  "learning_rate": 3e-05,
89
+ "loss": 1.2141,
90
  "step": 1100
91
  },
92
  {
93
+ "epoch": 1.1,
 
94
  "learning_rate": 3e-05,
95
+ "loss": 1.2178,
96
  "step": 1200
97
  },
98
  {
99
+ "epoch": 1.19,
 
100
  "learning_rate": 3e-05,
101
+ "loss": 1.1986,
102
  "step": 1300
103
  },
104
  {
105
+ "epoch": 1.29,
 
106
  "learning_rate": 3e-05,
107
+ "loss": 1.1903,
108
  "step": 1400
109
  },
110
  {
111
+ "epoch": 1.38,
 
112
  "learning_rate": 3e-05,
113
+ "loss": 1.1925,
114
  "step": 1500
115
  },
116
  {
117
+ "epoch": 1.47,
 
118
  "learning_rate": 3e-05,
119
+ "loss": 1.1954,
120
  "step": 1600
121
  },
122
  {
123
+ "epoch": 1.56,
 
124
  "learning_rate": 3e-05,
125
+ "loss": 1.1859,
126
  "step": 1700
127
  },
128
  {
129
+ "epoch": 1.65,
 
130
  "learning_rate": 3e-05,
131
+ "loss": 1.1979,
132
  "step": 1800
133
  },
134
  {
135
+ "epoch": 1.74,
 
136
  "learning_rate": 3e-05,
137
+ "loss": 1.201,
138
  "step": 1900
139
  },
140
  {
141
+ "epoch": 1.84,
 
142
  "learning_rate": 3e-05,
143
+ "loss": 1.1871,
144
  "step": 2000
145
  },
146
  {
147
+ "epoch": 1.93,
 
148
  "learning_rate": 3e-05,
149
+ "loss": 1.1949,
150
  "step": 2100
151
  },
152
  {
153
+ "epoch": 2.0,
154
+ "eval_accuracy": 0.5960506329113924,
155
+ "eval_loss": 1.834545373916626,
156
+ "eval_runtime": 4.5511,
157
+ "eval_samples_per_second": 109.863,
158
+ "eval_steps_per_second": 13.843,
159
  "step": 2178
160
  },
161
  {
162
+ "epoch": 2.0,
163
+ "eval_exact_match": 25.2,
164
+ "eval_f1": 34.41373827926461,
165
  "step": 2178
166
  },
167
  {
168
+ "epoch": 2.02,
 
169
  "learning_rate": 3e-05,
170
+ "loss": 1.1775,
171
  "step": 2200
172
  },
173
  {
174
+ "epoch": 2.11,
 
175
  "learning_rate": 3e-05,
176
+ "loss": 1.1579,
177
  "step": 2300
178
  },
179
  {
180
+ "epoch": 2.2,
 
181
  "learning_rate": 3e-05,
182
+ "loss": 1.1678,
183
  "step": 2400
184
  },
185
  {
186
+ "epoch": 2.3,
 
187
  "learning_rate": 3e-05,
188
+ "loss": 1.1758,
189
  "step": 2500
190
  },
191
  {
192
+ "epoch": 2.39,
 
193
  "learning_rate": 3e-05,
194
+ "loss": 1.1374,
195
  "step": 2600
196
  },
197
  {
198
+ "epoch": 2.48,
 
199
  "learning_rate": 3e-05,
200
+ "loss": 1.16,
201
  "step": 2700
202
  },
203
  {
204
+ "epoch": 2.57,
 
205
  "learning_rate": 3e-05,
206
+ "loss": 1.1647,
207
  "step": 2800
208
  },
209
  {
210
+ "epoch": 2.66,
 
211
  "learning_rate": 3e-05,
212
+ "loss": 1.1579,
213
  "step": 2900
214
  },
215
  {
216
+ "epoch": 2.75,
 
217
  "learning_rate": 3e-05,
218
+ "loss": 1.1647,
219
  "step": 3000
220
  },
221
  {
222
+ "epoch": 2.85,
 
223
  "learning_rate": 3e-05,
224
+ "loss": 1.1581,
225
  "step": 3100
226
  },
227
  {
228
+ "epoch": 2.94,
 
229
  "learning_rate": 3e-05,
230
+ "loss": 1.1542,
231
  "step": 3200
232
  },
233
  {
234
+ "epoch": 3.0,
235
+ "eval_accuracy": 0.596379746835443,
236
+ "eval_loss": 1.8205524682998657,
237
+ "eval_runtime": 4.5724,
238
+ "eval_samples_per_second": 109.352,
239
+ "eval_steps_per_second": 13.778,
240
  "step": 3267
241
  },
242
  {
243
+ "epoch": 3.0,
244
+ "eval_exact_match": 24.6,
245
+ "eval_f1": 34.42903485403487,
246
  "step": 3267
247
  },
248
  {
249
+ "epoch": 3.03,
 
250
  "learning_rate": 3e-05,
251
+ "loss": 1.1336,
252
  "step": 3300
253
  },
254
  {
255
+ "epoch": 3.12,
 
256
  "learning_rate": 3e-05,
257
+ "loss": 1.1112,
258
  "step": 3400
259
  },
260
  {
261
+ "epoch": 3.21,
 
262
  "learning_rate": 3e-05,
263
+ "loss": 1.123,
264
  "step": 3500
265
  },
266
  {
267
+ "epoch": 3.31,
 
268
  "learning_rate": 3e-05,
269
+ "loss": 1.1278,
270
  "step": 3600
271
  },
272
  {
273
+ "epoch": 3.4,
 
274
  "learning_rate": 3e-05,
275
+ "loss": 1.1226,
276
  "step": 3700
277
  },
278
  {
279
+ "epoch": 3.49,
 
280
  "learning_rate": 3e-05,
281
+ "loss": 1.1132,
282
  "step": 3800
283
  },
284
  {
285
+ "epoch": 3.58,
 
286
  "learning_rate": 3e-05,
287
+ "loss": 1.1075,
288
  "step": 3900
289
  },
290
  {
291
+ "epoch": 3.67,
 
292
  "learning_rate": 3e-05,
293
+ "loss": 1.128,
294
  "step": 4000
295
  },
296
  {
297
+ "epoch": 3.76,
 
298
  "learning_rate": 3e-05,
299
+ "loss": 1.1182,
300
  "step": 4100
301
  },
302
  {
303
+ "epoch": 3.86,
 
304
  "learning_rate": 3e-05,
305
+ "loss": 1.1288,
306
  "step": 4200
307
  },
308
  {
309
+ "epoch": 3.95,
 
310
  "learning_rate": 3e-05,
311
+ "loss": 1.1236,
312
  "step": 4300
313
  },
314
  {
315
  "epoch": 4.0,
316
+ "eval_accuracy": 0.600379746835443,
317
+ "eval_loss": 1.822967290878296,
318
+ "eval_runtime": 4.5641,
319
+ "eval_samples_per_second": 109.55,
320
+ "eval_steps_per_second": 13.803,
321
  "step": 4357
322
  },
323
  {
324
  "epoch": 4.0,
325
+ "eval_exact_match": 23.0,
326
+ "eval_f1": 32.55568709068711,
327
  "step": 4357
328
  },
329
  {
330
+ "epoch": 4.04,
 
331
  "learning_rate": 3e-05,
332
+ "loss": 1.1034,
333
  "step": 4400
334
  },
335
  {
336
+ "epoch": 4.13,
 
337
  "learning_rate": 3e-05,
338
+ "loss": 1.0822,
339
  "step": 4500
340
  },
341
  {
342
+ "epoch": 4.22,
 
343
  "learning_rate": 3e-05,
344
+ "loss": 1.0869,
345
  "step": 4600
346
  },
347
  {
348
+ "epoch": 4.31,
 
349
  "learning_rate": 3e-05,
350
+ "loss": 1.0813,
351
  "step": 4700
352
  },
353
  {
354
+ "epoch": 4.41,
 
355
  "learning_rate": 3e-05,
356
+ "loss": 1.081,
357
  "step": 4800
358
  },
359
  {
360
+ "epoch": 4.5,
 
361
  "learning_rate": 3e-05,
362
+ "loss": 1.1065,
363
  "step": 4900
364
  },
365
  {
366
+ "epoch": 4.59,
 
367
  "learning_rate": 3e-05,
368
+ "loss": 1.1018,
369
  "step": 5000
370
  },
371
  {
372
+ "epoch": 4.68,
 
373
  "learning_rate": 3e-05,
374
+ "loss": 1.0737,
375
  "step": 5100
376
  },
377
  {
378
+ "epoch": 4.77,
 
379
  "learning_rate": 3e-05,
380
+ "loss": 1.0823,
381
  "step": 5200
382
  },
383
  {
384
+ "epoch": 4.87,
 
385
  "learning_rate": 3e-05,
386
+ "loss": 1.0916,
387
  "step": 5300
388
  },
389
  {
390
+ "epoch": 4.96,
 
391
  "learning_rate": 3e-05,
392
+ "loss": 1.0989,
393
  "step": 5400
394
  },
395
  {
396
+ "epoch": 5.0,
397
+ "eval_accuracy": 0.6035696202531645,
398
+ "eval_loss": 1.8370839357376099,
399
+ "eval_runtime": 4.3939,
400
+ "eval_samples_per_second": 113.795,
401
+ "eval_steps_per_second": 14.338,
402
  "step": 5446
403
  },
404
  {
405
+ "epoch": 5.0,
406
+ "eval_exact_match": 23.8,
407
+ "eval_f1": 33.387894882894905,
408
  "step": 5446
409
  },
410
  {
411
+ "epoch": 5.05,
 
412
  "learning_rate": 3e-05,
413
+ "loss": 1.0542,
414
  "step": 5500
415
  },
416
  {
417
+ "epoch": 5.14,
 
418
  "learning_rate": 3e-05,
419
+ "loss": 1.0188,
420
  "step": 5600
421
  },
422
  {
423
+ "epoch": 5.23,
 
424
  "learning_rate": 3e-05,
425
+ "loss": 1.0364,
426
  "step": 5700
427
  },
428
  {
429
+ "epoch": 5.32,
 
430
  "learning_rate": 3e-05,
431
+ "loss": 1.0497,
432
  "step": 5800
433
  },
434
  {
435
+ "epoch": 5.42,
 
436
  "learning_rate": 3e-05,
437
+ "loss": 1.0561,
438
  "step": 5900
439
  },
440
  {
441
+ "epoch": 5.51,
 
442
  "learning_rate": 3e-05,
443
+ "loss": 1.0538,
444
  "step": 6000
445
  },
446
  {
447
+ "epoch": 5.6,
 
448
  "learning_rate": 3e-05,
449
+ "loss": 1.0574,
450
  "step": 6100
451
  },
452
  {
453
+ "epoch": 5.69,
 
454
  "learning_rate": 3e-05,
455
+ "loss": 1.0468,
456
  "step": 6200
457
  },
458
  {
459
+ "epoch": 5.78,
 
460
  "learning_rate": 3e-05,
461
+ "loss": 1.0524,
462
  "step": 6300
463
  },
464
  {
465
+ "epoch": 5.88,
 
466
  "learning_rate": 3e-05,
467
+ "loss": 1.0637,
468
  "step": 6400
469
  },
470
  {
471
+ "epoch": 5.97,
 
472
  "learning_rate": 3e-05,
473
+ "loss": 1.0543,
474
  "step": 6500
475
  },
476
  {
477
+ "epoch": 6.0,
478
+ "eval_accuracy": 0.6021518987341772,
479
+ "eval_loss": 1.8654963970184326,
480
+ "eval_runtime": 4.5885,
481
+ "eval_samples_per_second": 108.969,
482
+ "eval_steps_per_second": 13.73,
483
  "step": 6535
484
  },
485
  {
486
+ "epoch": 6.0,
487
+ "eval_exact_match": 23.4,
488
+ "eval_f1": 31.89286435786437,
489
  "step": 6535
490
  },
491
  {
492
+ "epoch": 6.06,
 
493
  "learning_rate": 3e-05,
494
+ "loss": 1.0225,
495
  "step": 6600
496
  },
497
  {
498
+ "epoch": 6.15,
 
499
  "learning_rate": 3e-05,
500
+ "loss": 1.0118,
501
  "step": 6700
502
  },
503
  {
504
+ "epoch": 6.24,
 
505
  "learning_rate": 3e-05,
506
+ "loss": 1.0165,
507
  "step": 6800
508
  },
509
  {
510
+ "epoch": 6.33,
 
511
  "learning_rate": 3e-05,
512
+ "loss": 1.0042,
513
  "step": 6900
514
  },
515
  {
516
+ "epoch": 6.43,
 
517
  "learning_rate": 3e-05,
518
+ "loss": 1.0064,
519
  "step": 7000
520
  },
521
  {
522
+ "epoch": 6.52,
 
523
  "learning_rate": 3e-05,
524
+ "loss": 1.0245,
525
  "step": 7100
526
  },
527
  {
528
+ "epoch": 6.61,
 
529
  "learning_rate": 3e-05,
530
+ "loss": 1.0091,
531
  "step": 7200
532
  },
533
  {
534
+ "epoch": 6.7,
 
535
  "learning_rate": 3e-05,
536
+ "loss": 1.0259,
537
  "step": 7300
538
  },
539
  {
540
+ "epoch": 6.79,
 
541
  "learning_rate": 3e-05,
542
+ "loss": 1.0,
543
  "step": 7400
544
  },
545
  {
546
+ "epoch": 6.89,
 
547
  "learning_rate": 3e-05,
548
+ "loss": 0.995,
549
  "step": 7500
550
  },
551
  {
552
+ "epoch": 6.98,
 
553
  "learning_rate": 3e-05,
554
+ "loss": 1.0139,
555
  "step": 7600
556
  },
557
  {
558
+ "epoch": 7.0,
559
+ "eval_accuracy": 0.592886075949367,
560
+ "eval_loss": 1.927990198135376,
561
+ "eval_runtime": 4.6988,
562
+ "eval_samples_per_second": 106.411,
563
+ "eval_steps_per_second": 13.408,
564
  "step": 7624
565
  },
566
  {
567
+ "epoch": 7.0,
568
+ "eval_exact_match": 23.0,
569
+ "eval_f1": 32.51321067821068,
570
  "step": 7624
571
  },
572
  {
573
+ "epoch": 7.07,
 
574
  "learning_rate": 3e-05,
575
+ "loss": 0.9788,
576
  "step": 7700
577
  },
578
  {
579
+ "epoch": 7.16,
 
580
  "learning_rate": 3e-05,
581
+ "loss": 0.9597,
582
  "step": 7800
583
  },
584
  {
585
+ "epoch": 7.25,
 
586
  "learning_rate": 3e-05,
587
+ "loss": 0.9623,
588
  "step": 7900
589
  },
590
  {
591
+ "epoch": 7.34,
 
592
  "learning_rate": 3e-05,
593
+ "loss": 0.9815,
594
  "step": 8000
595
  },
596
  {
597
+ "epoch": 7.44,
 
598
  "learning_rate": 3e-05,
599
+ "loss": 0.9831,
600
  "step": 8100
601
  },
602
  {
603
+ "epoch": 7.53,
 
604
  "learning_rate": 3e-05,
605
+ "loss": 0.9762,
606
  "step": 8200
607
  },
608
  {
609
+ "epoch": 7.62,
 
610
  "learning_rate": 3e-05,
611
+ "loss": 0.9769,
612
  "step": 8300
613
  },
614
  {
615
+ "epoch": 7.71,
 
616
  "learning_rate": 3e-05,
617
+ "loss": 0.9882,
618
  "step": 8400
619
  },
620
  {
621
+ "epoch": 7.8,
 
622
  "learning_rate": 3e-05,
623
+ "loss": 0.9736,
624
  "step": 8500
625
  },
626
  {
627
+ "epoch": 7.9,
 
628
  "learning_rate": 3e-05,
629
+ "loss": 0.9764,
630
  "step": 8600
631
  },
632
  {
633
+ "epoch": 7.99,
 
634
  "learning_rate": 3e-05,
635
+ "loss": 0.9764,
636
  "step": 8700
637
  },
638
  {
639
  "epoch": 8.0,
640
+ "eval_accuracy": 0.5912658227848101,
641
+ "eval_loss": 1.991403579711914,
642
+ "eval_runtime": 4.6999,
643
+ "eval_samples_per_second": 106.385,
644
+ "eval_steps_per_second": 13.404,
645
  "step": 8714
646
  },
647
  {
648
  "epoch": 8.0,
649
+ "eval_exact_match": 20.8,
650
+ "eval_f1": 30.54087925962926,
651
  "step": 8714
652
  },
653
  {
654
+ "epoch": 8.08,
 
655
  "learning_rate": 3e-05,
656
+ "loss": 0.9509,
657
  "step": 8800
658
  },
659
  {
660
+ "epoch": 8.17,
 
661
  "learning_rate": 3e-05,
662
+ "loss": 0.9319,
663
  "step": 8900
664
  },
665
  {
666
+ "epoch": 8.26,
 
667
  "learning_rate": 3e-05,
668
+ "loss": 0.9254,
669
  "step": 9000
670
  },
671
  {
672
+ "epoch": 8.35,
 
673
  "learning_rate": 3e-05,
674
+ "loss": 0.9396,
675
  "step": 9100
676
  },
677
  {
678
+ "epoch": 8.45,
 
679
  "learning_rate": 3e-05,
680
+ "loss": 0.9439,
681
  "step": 9200
682
  },
683
  {
684
+ "epoch": 8.54,
 
685
  "learning_rate": 3e-05,
686
+ "loss": 0.9434,
687
  "step": 9300
688
  },
689
  {
690
+ "epoch": 8.63,
 
691
  "learning_rate": 3e-05,
692
+ "loss": 0.9283,
693
  "step": 9400
694
  },
695
  {
696
+ "epoch": 8.72,
 
697
  "learning_rate": 3e-05,
698
+ "loss": 0.9409,
699
  "step": 9500
700
  },
701
  {
702
+ "epoch": 8.81,
 
703
  "learning_rate": 3e-05,
704
+ "loss": 0.9474,
705
  "step": 9600
706
  },
707
  {
708
+ "epoch": 8.91,
 
709
  "learning_rate": 3e-05,
710
+ "loss": 0.9538,
711
  "step": 9700
712
  },
713
  {
714
+ "epoch": 9.0,
 
715
  "learning_rate": 3e-05,
716
+ "loss": 0.9351,
717
  "step": 9800
718
  },
719
  {
720
+ "epoch": 9.0,
721
+ "eval_accuracy": 0.5909367088607594,
722
+ "eval_loss": 2.0564663410186768,
723
+ "eval_runtime": 4.8989,
724
+ "eval_samples_per_second": 102.064,
725
+ "eval_steps_per_second": 12.86,
726
  "step": 9803
727
  },
728
  {
729
+ "epoch": 9.0,
730
+ "eval_exact_match": 20.2,
731
+ "eval_f1": 29.28648074148075,
732
  "step": 9803
733
  },
734
  {
735
+ "epoch": 9.09,
 
736
  "learning_rate": 3e-05,
737
+ "loss": 0.8896,
738
  "step": 9900
739
  },
740
  {
741
+ "epoch": 9.18,
 
742
  "learning_rate": 3e-05,
743
+ "loss": 0.8923,
744
  "step": 10000
745
  },
746
  {
747
+ "epoch": 9.27,
 
748
  "learning_rate": 3e-05,
749
+ "loss": 0.9074,
750
  "step": 10100
751
  },
752
  {
753
+ "epoch": 9.36,
 
754
  "learning_rate": 3e-05,
755
+ "loss": 0.905,
756
  "step": 10200
757
  },
758
  {
759
+ "epoch": 9.46,
 
760
  "learning_rate": 3e-05,
761
+ "loss": 0.8954,
762
  "step": 10300
763
  },
764
  {
765
+ "epoch": 9.55,
 
766
  "learning_rate": 3e-05,
767
+ "loss": 0.8953,
768
  "step": 10400
769
  },
770
  {
771
+ "epoch": 9.64,
 
772
  "learning_rate": 3e-05,
773
+ "loss": 0.8943,
774
  "step": 10500
775
  },
776
  {
777
+ "epoch": 9.73,
 
778
  "learning_rate": 3e-05,
779
+ "loss": 0.9173,
780
  "step": 10600
781
  },
782
  {
783
+ "epoch": 9.82,
 
784
  "learning_rate": 3e-05,
785
+ "loss": 0.9148,
786
  "step": 10700
787
  },
788
  {
789
+ "epoch": 9.92,
 
790
  "learning_rate": 3e-05,
791
+ "loss": 0.9177,
792
  "step": 10800
793
  },
794
  {
795
+ "epoch": 10.0,
796
+ "eval_accuracy": 0.5891898734177216,
797
+ "eval_loss": 2.124812126159668,
798
+ "eval_runtime": 4.5237,
799
+ "eval_samples_per_second": 110.528,
800
+ "eval_steps_per_second": 13.927,
801
  "step": 10892
802
  },
803
  {
804
+ "epoch": 10.0,
805
  "eval_exact_match": 0.0,
806
+ "eval_f1": 1.653482807812982,
807
  "step": 10892
808
  },
809
  {
810
+ "epoch": 10.01,
 
811
  "learning_rate": 3e-05,
812
+ "loss": 0.9157,
813
  "step": 10900
814
  },
815
  {
816
+ "epoch": 10.1,
 
817
  "learning_rate": 3e-05,
818
+ "loss": 0.8335,
819
  "step": 11000
820
  },
821
  {
822
+ "epoch": 10.19,
 
823
  "learning_rate": 3e-05,
824
+ "loss": 0.8485,
825
  "step": 11100
826
  },
827
  {
828
+ "epoch": 10.28,
 
829
  "learning_rate": 3e-05,
830
+ "loss": 0.8712,
831
  "step": 11200
832
  },
833
  {
834
+ "epoch": 10.37,
 
835
  "learning_rate": 3e-05,
836
+ "loss": 0.869,
837
  "step": 11300
838
  },
839
  {
840
+ "epoch": 10.47,
 
841
  "learning_rate": 3e-05,
842
+ "loss": 0.8655,
843
  "step": 11400
844
  },
845
  {
846
+ "epoch": 10.56,
 
847
  "learning_rate": 3e-05,
848
+ "loss": 0.8868,
849
  "step": 11500
850
  },
851
  {
852
+ "epoch": 10.65,
 
853
  "learning_rate": 3e-05,
854
+ "loss": 0.8775,
855
  "step": 11600
856
  },
857
  {
858
+ "epoch": 10.74,
 
859
  "learning_rate": 3e-05,
860
+ "loss": 0.8861,
861
  "step": 11700
862
  },
863
  {
864
+ "epoch": 10.83,
 
865
  "learning_rate": 3e-05,
866
+ "loss": 0.892,
867
  "step": 11800
868
  },
869
  {
870
+ "epoch": 10.92,
 
871
  "learning_rate": 3e-05,
872
+ "loss": 0.8872,
873
  "step": 11900
874
  },
875
  {
876
+ "epoch": 11.0,
877
+ "eval_accuracy": 0.5874683544303797,
878
+ "eval_loss": 2.2181763648986816,
879
+ "eval_runtime": 4.6178,
880
+ "eval_samples_per_second": 108.277,
881
+ "eval_steps_per_second": 13.643,
882
  "step": 11981
883
  },
884
  {
885
+ "epoch": 11.0,
886
  "eval_exact_match": 0.0,
887
+ "eval_f1": 0.014945652173913042,
888
  "step": 11981
889
  },
890
  {
891
+ "epoch": 11.02,
 
892
  "learning_rate": 3e-05,
893
+ "loss": 0.8808,
894
  "step": 12000
895
  },
896
  {
897
+ "epoch": 11.11,
 
898
  "learning_rate": 3e-05,
899
+ "loss": 0.8145,
900
  "step": 12100
901
  },
902
  {
903
+ "epoch": 11.2,
 
904
  "learning_rate": 3e-05,
905
+ "loss": 0.8301,
906
  "step": 12200
907
  },
908
  {
909
+ "epoch": 11.29,
 
910
  "learning_rate": 3e-05,
911
+ "loss": 0.8281,
912
  "step": 12300
913
  },
914
  {
915
+ "epoch": 11.38,
 
916
  "learning_rate": 3e-05,
917
+ "loss": 0.8464,
918
  "step": 12400
919
  },
920
  {
921
+ "epoch": 11.48,
 
922
  "learning_rate": 3e-05,
923
+ "loss": 0.8547,
924
  "step": 12500
925
  },
926
  {
927
+ "epoch": 11.57,
 
928
  "learning_rate": 3e-05,
929
+ "loss": 0.8424,
930
  "step": 12600
931
  },
932
  {
933
+ "epoch": 11.66,
 
934
  "learning_rate": 3e-05,
935
+ "loss": 0.8452,
936
  "step": 12700
937
  },
938
  {
939
+ "epoch": 11.75,
 
940
  "learning_rate": 3e-05,
941
+ "loss": 0.8528,
942
  "step": 12800
943
  },
944
  {
945
+ "epoch": 11.84,
 
946
  "learning_rate": 3e-05,
947
+ "loss": 0.862,
948
  "step": 12900
949
  },
950
  {
951
+ "epoch": 11.93,
 
952
  "learning_rate": 3e-05,
953
+ "loss": 0.8458,
954
  "step": 13000
955
  },
956
  {
957
  "epoch": 12.0,
958
+ "eval_accuracy": 0.5862784810126582,
959
+ "eval_loss": 2.286332130432129,
960
+ "eval_runtime": 4.5481,
961
+ "eval_samples_per_second": 109.935,
962
+ "eval_steps_per_second": 13.852,
963
  "step": 13071
964
  },
965
  {
966
  "epoch": 12.0,
967
  "eval_exact_match": 0.0,
968
+ "eval_f1": 0.05640597466324206,
969
  "step": 13071
970
  },
971
  {
972
+ "epoch": 12.03,
 
973
  "learning_rate": 3e-05,
974
+ "loss": 0.8407,
975
  "step": 13100
976
  },
977
  {
978
+ "epoch": 12.12,
 
979
  "learning_rate": 3e-05,
980
+ "loss": 0.8103,
981
  "step": 13200
982
  },
983
  {
984
+ "epoch": 12.21,
 
985
  "learning_rate": 3e-05,
986
+ "loss": 0.8044,
987
  "step": 13300
988
  },
989
  {
990
+ "epoch": 12.3,
 
991
  "learning_rate": 3e-05,
992
+ "loss": 0.819,
993
  "step": 13400
994
  },
995
  {
996
+ "epoch": 12.39,
 
997
  "learning_rate": 3e-05,
998
+ "loss": 0.7983,
999
  "step": 13500
1000
  },
1001
  {
1002
+ "epoch": 12.49,
 
1003
  "learning_rate": 3e-05,
1004
+ "loss": 0.8057,
1005
  "step": 13600
1006
  },
1007
  {
1008
+ "epoch": 12.58,
 
1009
  "learning_rate": 3e-05,
1010
+ "loss": 0.8133,
1011
  "step": 13700
1012
  },
1013
  {
1014
+ "epoch": 12.67,
 
1015
  "learning_rate": 3e-05,
1016
+ "loss": 0.8267,
1017
  "step": 13800
1018
  },
1019
  {
1020
+ "epoch": 12.76,
 
1021
  "learning_rate": 3e-05,
1022
+ "loss": 0.8082,
1023
  "step": 13900
1024
  },
1025
  {
1026
+ "epoch": 12.85,
 
1027
  "learning_rate": 3e-05,
1028
+ "loss": 0.8316,
1029
  "step": 14000
1030
  },
1031
  {
1032
+ "epoch": 12.94,
 
1033
  "learning_rate": 3e-05,
1034
+ "loss": 0.8148,
1035
  "step": 14100
1036
  },
1037
  {
1038
+ "epoch": 13.0,
1039
+ "eval_accuracy": 0.5842278481012658,
1040
+ "eval_loss": 2.352457284927368,
1041
+ "eval_runtime": 4.464,
1042
+ "eval_samples_per_second": 112.008,
1043
+ "eval_steps_per_second": 14.113,
1044
  "step": 14160
1045
  },
1046
  {
1047
+ "epoch": 13.0,
1048
  "eval_exact_match": 0.0,
1049
+ "eval_f1": 0.08618200930627685,
1050
  "step": 14160
1051
  },
1052
  {
1053
+ "epoch": 13.04,
 
1054
  "learning_rate": 3e-05,
1055
+ "loss": 0.798,
1056
  "step": 14200
1057
  },
1058
  {
1059
+ "epoch": 13.13,
 
1060
  "learning_rate": 3e-05,
1061
+ "loss": 0.7788,
1062
  "step": 14300
1063
  },
1064
  {
1065
+ "epoch": 13.22,
 
1066
  "learning_rate": 3e-05,
1067
+ "loss": 0.7727,
1068
  "step": 14400
1069
  },
1070
  {
1071
+ "epoch": 13.31,
 
1072
  "learning_rate": 3e-05,
1073
+ "loss": 0.7831,
1074
  "step": 14500
1075
  },
1076
  {
1077
+ "epoch": 13.4,
 
1078
  "learning_rate": 3e-05,
1079
+ "loss": 0.7901,
1080
  "step": 14600
1081
  },
1082
  {
1083
+ "epoch": 13.5,
 
1084
  "learning_rate": 3e-05,
1085
+ "loss": 0.7993,
1086
  "step": 14700
1087
  },
1088
  {
1089
+ "epoch": 13.59,
 
1090
  "learning_rate": 3e-05,
1091
+ "loss": 0.7864,
1092
  "step": 14800
1093
  },
1094
  {
1095
+ "epoch": 13.68,
 
1096
  "learning_rate": 3e-05,
1097
+ "loss": 0.7813,
1098
  "step": 14900
1099
  },
1100
  {
1101
+ "epoch": 13.77,
 
1102
  "learning_rate": 3e-05,
1103
+ "loss": 0.8024,
1104
  "step": 15000
1105
  },
1106
  {
1107
+ "epoch": 13.86,
 
1108
  "learning_rate": 3e-05,
1109
+ "loss": 0.7945,
1110
  "step": 15100
1111
  },
1112
  {
1113
+ "epoch": 13.95,
 
1114
  "learning_rate": 3e-05,
1115
+ "loss": 0.7955,
1116
  "step": 15200
1117
  },
1118
  {
1119
+ "epoch": 14.0,
1120
+ "eval_accuracy": 0.5834177215189873,
1121
+ "eval_loss": 2.444202184677124,
1122
+ "eval_runtime": 4.5506,
1123
+ "eval_samples_per_second": 109.877,
1124
+ "eval_steps_per_second": 13.844,
1125
  "step": 15249
1126
  },
1127
  {
1128
+ "epoch": 14.0,
1129
  "eval_exact_match": 0.0,
1130
+ "eval_f1": 0.04849394305657365,
1131
  "step": 15249
1132
  },
1133
  {
1134
+ "epoch": 14.05,
 
1135
  "learning_rate": 3e-05,
1136
+ "loss": 0.7808,
1137
  "step": 15300
1138
  },
1139
  {
1140
+ "epoch": 14.14,
 
1141
  "learning_rate": 3e-05,
1142
+ "loss": 0.752,
1143
  "step": 15400
1144
  },
1145
  {
1146
+ "epoch": 14.23,
 
1147
  "learning_rate": 3e-05,
1148
+ "loss": 0.7533,
1149
  "step": 15500
1150
  },
1151
  {
1152
+ "epoch": 14.32,
 
1153
  "learning_rate": 3e-05,
1154
+ "loss": 0.7458,
1155
  "step": 15600
1156
  },
1157
  {
1158
+ "epoch": 14.41,
 
1159
  "learning_rate": 3e-05,
1160
+ "loss": 0.7527,
1161
  "step": 15700
1162
  },
1163
  {
1164
+ "epoch": 14.51,
 
1165
  "learning_rate": 3e-05,
1166
+ "loss": 0.7625,
1167
  "step": 15800
1168
  },
1169
  {
1170
+ "epoch": 14.6,
 
1171
  "learning_rate": 3e-05,
1172
+ "loss": 0.7424,
1173
  "step": 15900
1174
  },
1175
  {
1176
+ "epoch": 14.69,
 
1177
  "learning_rate": 3e-05,
1178
+ "loss": 0.7618,
1179
  "step": 16000
1180
  },
1181
  {
1182
+ "epoch": 14.78,
 
1183
  "learning_rate": 3e-05,
1184
+ "loss": 0.7589,
1185
  "step": 16100
1186
  },
1187
  {
1188
+ "epoch": 14.87,
 
1189
  "learning_rate": 3e-05,
1190
+ "loss": 0.7657,
1191
  "step": 16200
1192
  },
1193
  {
1194
+ "epoch": 14.96,
 
1195
  "learning_rate": 3e-05,
1196
+ "loss": 0.7765,
1197
  "step": 16300
1198
  },
1199
  {
1200
+ "epoch": 15.0,
1201
+ "eval_accuracy": 0.583620253164557,
1202
+ "eval_loss": 2.4962217807769775,
1203
+ "eval_runtime": 4.7551,
1204
+ "eval_samples_per_second": 105.15,
1205
+ "eval_steps_per_second": 13.249,
1206
  "step": 16338
1207
  },
1208
  {
1209
+ "epoch": 15.0,
1210
  "eval_exact_match": 0.0,
1211
+ "eval_f1": 0.013684210526315792,
1212
  "step": 16338
1213
  },
1214
  {
1215
+ "epoch": 15.06,
 
1216
  "learning_rate": 3e-05,
1217
+ "loss": 0.7377,
1218
  "step": 16400
1219
  },
1220
  {
1221
+ "epoch": 15.15,
 
1222
  "learning_rate": 3e-05,
1223
+ "loss": 0.7204,
1224
  "step": 16500
1225
  },
1226
  {
1227
+ "epoch": 15.24,
 
1228
  "learning_rate": 3e-05,
1229
+ "loss": 0.7176,
1230
  "step": 16600
1231
  },
1232
  {
1233
+ "epoch": 15.33,
 
1234
  "learning_rate": 3e-05,
1235
+ "loss": 0.7164,
1236
  "step": 16700
1237
  },
1238
  {
1239
+ "epoch": 15.42,
 
1240
  "learning_rate": 3e-05,
1241
+ "loss": 0.7266,
1242
  "step": 16800
1243
  },
1244
  {
1245
+ "epoch": 15.52,
 
1246
  "learning_rate": 3e-05,
1247
+ "loss": 0.7393,
1248
  "step": 16900
1249
  },
1250
  {
1251
+ "epoch": 15.61,
 
1252
  "learning_rate": 3e-05,
1253
+ "loss": 0.727,
1254
  "step": 17000
1255
  },
1256
  {
1257
+ "epoch": 15.7,
 
1258
  "learning_rate": 3e-05,
1259
+ "loss": 0.7364,
1260
  "step": 17100
1261
  },
1262
  {
1263
+ "epoch": 15.79,
 
1264
  "learning_rate": 3e-05,
1265
+ "loss": 0.7337,
1266
  "step": 17200
1267
  },
1268
  {
1269
+ "epoch": 15.88,
 
1270
  "learning_rate": 3e-05,
1271
+ "loss": 0.7458,
1272
  "step": 17300
1273
  },
1274
  {
1275
+ "epoch": 15.97,
 
1276
  "learning_rate": 3e-05,
1277
+ "loss": 0.7412,
1278
  "step": 17400
1279
  },
1280
  {
1281
  "epoch": 16.0,
1282
+ "eval_accuracy": 0.5817974683544304,
1283
+ "eval_loss": 2.574023962020874,
1284
+ "eval_runtime": 4.5479,
1285
+ "eval_samples_per_second": 109.941,
1286
+ "eval_steps_per_second": 13.853,
1287
  "step": 17428
1288
  },
1289
  {
1290
  "epoch": 16.0,
1291
  "eval_exact_match": 0.0,
1292
+ "eval_f1": 0.006153846153846153,
1293
  "step": 17428
1294
  }
1295
  ],
1296
  "logging_steps": 100,
1297
  "max_steps": 54450,
 
1298
  "num_train_epochs": 50,
1299
  "save_steps": 500,
1300
+ "total_flos": 3.2798113522306253e+18,
 
 
 
 
 
 
 
 
 
 
 
 
 
1301
  "trial_name": null,
1302
  "trial_params": null
1303
  }
checkpoint-17428/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5377ce2541570511bdbb29b699438bab217d7b0b49a543f6293cd608457a8a9d
3
- size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:990ba0ec6800794569e1f142d0a98c1c27c344ee04644eb9619680965b92e617
3
+ size 4728