duraad committed on
Commit ee86c86
1 Parent(s): 5806b8c

After 10 epochs

config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "duraad/nep-spell-hft",
+   "architectures": [
+     "MT5ForConditionalGeneration"
+   ],
+   "classifier_dropout": 0.0,
+   "d_ff": 1024,
+   "d_kv": 64,
+   "d_model": 512,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "mt5",
+   "num_decoder_layers": 8,
+   "num_heads": 6,
+   "num_layers": 8,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "tie_word_embeddings": false,
+   "tokenizer_class": "T5Tokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.2",
+   "use_cache": true,
+   "vocab_size": 250112
+ }
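
The config above describes a small mT5 encoder-decoder (8 encoder and 8 decoder layers, d_model 512, 6 attention heads, a 250k-token vocabulary). A minimal loading sketch, assuming the checkpoint is published under the repo id given in "_name_or_path" (duraad/nep-spell-hft) and that the tokenizer files are present in the same repo:

```python
# Minimal loading sketch; the repo id is taken from "_name_or_path" and may differ for this commit.
from transformers import MT5ForConditionalGeneration, T5Tokenizer

repo_id = "duraad/nep-spell-hft"
tokenizer = T5Tokenizer.from_pretrained(repo_id)               # matches "tokenizer_class"
model = MT5ForConditionalGeneration.from_pretrained(repo_id)   # matches "architectures"[0]
```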
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "decoder_start_token_id": 0,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.37.2"
+ }
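
generation_config.json only pins the special-token ids inherited from the model config ("_from_model_config": true); model.generate() picks these up automatically. A hedged usage sketch, continuing from the loading example above (the input string is a placeholder, not real data):

```python
# Inference sketch; replace the placeholder with a Nepali sentence to spell-check.
text = "<placeholder Nepali sentence>"
inputs = tokenizer(text, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)  # pad/eos/decoder_start ids come from generation_config.json
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```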
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71a88dd7c7f1cbb37952e9b1f0b8f18b3302c3287135d70933705fb66682bbb8
+ size 1200729512
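
The three lines above are a Git LFS pointer, not the weights themselves; the actual model.safetensors (about 1.2 GB per the size field) lives in LFS storage. A sketch of fetching just this file with huggingface_hub, again assuming the repo id used above:

```python
# Download only the LFS-backed weights file; repo_id is an assumption carried over from config.json.
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(repo_id="duraad/nep-spell-hft", filename="model.safetensors")
print(local_path)  # cached local path of the ~1.2 GB file
```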
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9914f35ec4166caaefa365fc45342de2ff25e5e31820e04a2e7d1cc4b5461c6
+ size 2401573893
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:575164422ffb07dce40f4f226304ae580f088580435499d59fc33efa74eb8a4e
+ size 14575
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3771f987b89d5be40d08367559c2ba7770e79a7c012a7171d9578f9ad0ee0dbd
+ size 627
trainer_state.json ADDED
@@ -0,0 +1,692 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 8.0,
+   "eval_steps": 20000,
+   "global_step": 101152,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.08,
+       "learning_rate": 4.980149277433699e-05,
+       "loss": 0.0094,
+       "step": 1000
+     },
+     {
+       "epoch": 0.16,
+       "learning_rate": 4.9404478323010964e-05,
+       "loss": 0.0102,
+       "step": 2000
+     },
+     {
+       "epoch": 0.24,
+       "learning_rate": 4.900746387168493e-05,
+       "loss": 0.0098,
+       "step": 3000
+     },
+     {
+       "epoch": 0.32,
+       "learning_rate": 4.86104494203589e-05,
+       "loss": 0.0101,
+       "step": 4000
+     },
+     {
+       "epoch": 0.4,
+       "learning_rate": 4.821343496903287e-05,
+       "loss": 0.0098,
+       "step": 5000
+     },
+     {
+       "epoch": 0.47,
+       "learning_rate": 4.781642051770684e-05,
+       "loss": 0.0097,
+       "step": 6000
+     },
+     {
+       "epoch": 0.55,
+       "learning_rate": 4.741940606638082e-05,
+       "loss": 0.0099,
+       "step": 7000
+     },
+     {
+       "epoch": 0.63,
+       "learning_rate": 4.702239161505479e-05,
+       "loss": 0.01,
+       "step": 8000
+     },
+     {
+       "epoch": 0.71,
+       "learning_rate": 4.6625377163728765e-05,
+       "loss": 0.01,
+       "step": 9000
+     },
+     {
+       "epoch": 0.79,
+       "learning_rate": 4.6228362712402734e-05,
+       "loss": 0.0119,
+       "step": 10000
+     },
+     {
+       "epoch": 0.87,
+       "learning_rate": 4.5831348261076704e-05,
+       "loss": 0.0095,
+       "step": 11000
+     },
+     {
+       "epoch": 0.95,
+       "learning_rate": 4.543433380975068e-05,
+       "loss": 0.0097,
+       "step": 12000
+     },
+     {
+       "epoch": 1.03,
+       "learning_rate": 4.503731935842465e-05,
+       "loss": 0.0095,
+       "step": 13000
+     },
+     {
+       "epoch": 1.11,
+       "learning_rate": 4.464030490709862e-05,
+       "loss": 0.0089,
+       "step": 14000
+     },
+     {
+       "epoch": 1.19,
+       "learning_rate": 4.424329045577259e-05,
+       "loss": 0.0088,
+       "step": 15000
+     },
+     {
+       "epoch": 1.27,
+       "learning_rate": 4.3846276004446566e-05,
+       "loss": 0.0088,
+       "step": 16000
+     },
+     {
+       "epoch": 1.34,
+       "learning_rate": 4.3449261553120535e-05,
+       "loss": 0.0085,
+       "step": 17000
+     },
+     {
+       "epoch": 1.42,
+       "learning_rate": 4.3052247101794505e-05,
+       "loss": 0.0087,
+       "step": 18000
+     },
+     {
+       "epoch": 1.5,
+       "learning_rate": 4.265523265046848e-05,
+       "loss": 0.009,
+       "step": 19000
+     },
+     {
+       "epoch": 1.58,
+       "learning_rate": 4.225821819914245e-05,
+       "loss": 0.0087,
+       "step": 20000
+     },
+     {
+       "epoch": 1.58,
+       "eval_accuracy": 0.5290210343191523,
+       "eval_exact_match": 0.5290210343191523,
+       "eval_f1": 0.531319521324266,
+       "eval_loss": 0.005351942032575607,
+       "eval_precision": 0.5358216036691444,
+       "eval_recall": 0.5290210343191523,
+       "eval_runtime": 356.516,
+       "eval_samples_per_second": 17.736,
+       "eval_steps_per_second": 8.869,
+       "step": 20000
+     },
+     {
+       "epoch": 1.66,
+       "learning_rate": 4.186120374781643e-05,
+       "loss": 0.0082,
+       "step": 21000
+     },
+     {
+       "epoch": 1.74,
+       "learning_rate": 4.14641892964904e-05,
+       "loss": 0.0084,
+       "step": 22000
+     },
+     {
+       "epoch": 1.82,
+       "learning_rate": 4.1067174845164366e-05,
+       "loss": 0.0082,
+       "step": 23000
+     },
+     {
+       "epoch": 1.9,
+       "learning_rate": 4.0670160393838336e-05,
+       "loss": 0.0084,
+       "step": 24000
+     },
+     {
+       "epoch": 1.98,
+       "learning_rate": 4.0273145942512306e-05,
+       "loss": 0.0084,
+       "step": 25000
+     },
+     {
+       "epoch": 2.06,
+       "learning_rate": 3.987613149118628e-05,
+       "loss": 0.008,
+       "step": 26000
+     },
+     {
+       "epoch": 2.14,
+       "learning_rate": 3.947911703986025e-05,
+       "loss": 0.0075,
+       "step": 27000
+     },
+     {
+       "epoch": 2.21,
+       "learning_rate": 3.908210258853423e-05,
+       "loss": 0.0078,
+       "step": 28000
+     },
+     {
+       "epoch": 2.29,
+       "learning_rate": 3.86850881372082e-05,
+       "loss": 0.008,
+       "step": 29000
+     },
+     {
+       "epoch": 2.37,
+       "learning_rate": 3.828807368588217e-05,
+       "loss": 0.0075,
+       "step": 30000
+     },
+     {
+       "epoch": 2.45,
+       "learning_rate": 3.7891059234556144e-05,
+       "loss": 0.0077,
+       "step": 31000
+     },
+     {
+       "epoch": 2.53,
+       "learning_rate": 3.749404478323011e-05,
+       "loss": 0.0074,
+       "step": 32000
+     },
+     {
+       "epoch": 2.61,
+       "learning_rate": 3.709703033190408e-05,
+       "loss": 0.0078,
+       "step": 33000
+     },
+     {
+       "epoch": 2.69,
+       "learning_rate": 3.670001588057805e-05,
+       "loss": 0.008,
+       "step": 34000
+     },
+     {
+       "epoch": 2.77,
+       "learning_rate": 3.630300142925203e-05,
+       "loss": 0.0073,
+       "step": 35000
+     },
+     {
+       "epoch": 2.85,
+       "learning_rate": 3.5905986977926e-05,
+       "loss": 0.0076,
+       "step": 36000
+     },
+     {
+       "epoch": 2.93,
+       "learning_rate": 3.550897252659997e-05,
+       "loss": 0.0076,
+       "step": 37000
+     },
+     {
+       "epoch": 3.01,
+       "learning_rate": 3.5111958075273945e-05,
+       "loss": 0.0078,
+       "step": 38000
+     },
+     {
+       "epoch": 3.08,
+       "learning_rate": 3.4714943623947914e-05,
+       "loss": 0.007,
+       "step": 39000
+     },
+     {
+       "epoch": 3.16,
+       "learning_rate": 3.431792917262189e-05,
+       "loss": 0.0065,
+       "step": 40000
+     },
+     {
+       "epoch": 3.16,
+       "eval_accuracy": 0.5473667562865728,
+       "eval_exact_match": 0.5473667562865728,
+       "eval_f1": 0.5494543729242448,
+       "eval_loss": 0.005287344101816416,
+       "eval_precision": 0.5535347145342401,
+       "eval_recall": 0.5473667562865728,
+       "eval_runtime": 354.3494,
+       "eval_samples_per_second": 17.844,
+       "eval_steps_per_second": 8.923,
+       "step": 40000
+     },
+     {
+       "epoch": 3.24,
+       "learning_rate": 3.3920914721295854e-05,
+       "loss": 0.007,
+       "step": 41000
+     },
+     {
+       "epoch": 3.32,
+       "learning_rate": 3.352390026996983e-05,
+       "loss": 0.0071,
+       "step": 42000
+     },
+     {
+       "epoch": 3.4,
+       "learning_rate": 3.31268858186438e-05,
+       "loss": 0.0074,
+       "step": 43000
+     },
+     {
+       "epoch": 3.48,
+       "learning_rate": 3.272987136731777e-05,
+       "loss": 0.0068,
+       "step": 44000
+     },
+     {
+       "epoch": 3.56,
+       "learning_rate": 3.2332856915991746e-05,
+       "loss": 0.0067,
+       "step": 45000
+     },
+     {
+       "epoch": 3.64,
+       "learning_rate": 3.1935842464665715e-05,
+       "loss": 0.0071,
+       "step": 46000
+     },
+     {
+       "epoch": 3.72,
+       "learning_rate": 3.153882801333969e-05,
+       "loss": 0.0067,
+       "step": 47000
+     },
+     {
+       "epoch": 3.8,
+       "learning_rate": 3.114181356201366e-05,
+       "loss": 0.0071,
+       "step": 48000
+     },
+     {
+       "epoch": 3.88,
+       "learning_rate": 3.074479911068763e-05,
+       "loss": 0.0073,
+       "step": 49000
+     },
+     {
+       "epoch": 3.95,
+       "learning_rate": 3.03477846593616e-05,
+       "loss": 0.0067,
+       "step": 50000
+     },
+     {
+       "epoch": 4.03,
+       "learning_rate": 2.9950770208035574e-05,
+       "loss": 0.0067,
+       "step": 51000
+     },
+     {
+       "epoch": 4.11,
+       "learning_rate": 2.9553755756709543e-05,
+       "loss": 0.0064,
+       "step": 52000
+     },
+     {
+       "epoch": 4.19,
+       "learning_rate": 2.9156741305383516e-05,
+       "loss": 0.0062,
+       "step": 53000
+     },
+     {
+       "epoch": 4.27,
+       "learning_rate": 2.875972685405749e-05,
+       "loss": 0.0066,
+       "step": 54000
+     },
+     {
+       "epoch": 4.35,
+       "learning_rate": 2.8362712402731462e-05,
+       "loss": 0.0062,
+       "step": 55000
+     },
+     {
+       "epoch": 4.43,
+       "learning_rate": 2.7965697951405435e-05,
+       "loss": 0.0065,
+       "step": 56000
+     },
+     {
+       "epoch": 4.51,
+       "learning_rate": 2.7568683500079405e-05,
+       "loss": 0.0066,
+       "step": 57000
+     },
+     {
+       "epoch": 4.59,
+       "learning_rate": 2.7171669048753375e-05,
+       "loss": 0.0063,
+       "step": 58000
+     },
+     {
+       "epoch": 4.67,
+       "learning_rate": 2.6774654597427344e-05,
+       "loss": 0.0064,
+       "step": 59000
+     },
+     {
+       "epoch": 4.75,
+       "learning_rate": 2.6377640146101317e-05,
+       "loss": 0.006,
+       "step": 60000
+     },
+     {
+       "epoch": 4.75,
+       "eval_accuracy": 0.5701407559702673,
+       "eval_exact_match": 0.5701407559702673,
+       "eval_f1": 0.5724919605672413,
+       "eval_loss": 0.0050529008731245995,
+       "eval_precision": 0.5770994780958406,
+       "eval_recall": 0.5701407559702673,
+       "eval_runtime": 354.5246,
+       "eval_samples_per_second": 17.835,
+       "eval_steps_per_second": 8.919,
+       "step": 60000
+     },
+     {
+       "epoch": 4.82,
+       "learning_rate": 2.598062569477529e-05,
+       "loss": 0.0065,
+       "step": 61000
+     },
+     {
+       "epoch": 4.9,
+       "learning_rate": 2.5583611243449263e-05,
+       "loss": 0.0063,
+       "step": 62000
+     },
+     {
+       "epoch": 4.98,
+       "learning_rate": 2.5186596792123236e-05,
+       "loss": 0.0068,
+       "step": 63000
+     },
+     {
+       "epoch": 5.06,
+       "learning_rate": 2.4789582340797206e-05,
+       "loss": 0.006,
+       "step": 64000
+     },
+     {
+       "epoch": 5.14,
+       "learning_rate": 2.4392567889471176e-05,
+       "loss": 0.006,
+       "step": 65000
+     },
+     {
+       "epoch": 5.22,
+       "learning_rate": 2.399555343814515e-05,
+       "loss": 0.0058,
+       "step": 66000
+     },
+     {
+       "epoch": 5.3,
+       "learning_rate": 2.359853898681912e-05,
+       "loss": 0.0056,
+       "step": 67000
+     },
+     {
+       "epoch": 5.38,
+       "learning_rate": 2.3201524535493095e-05,
+       "loss": 0.0057,
+       "step": 68000
+     },
+     {
+       "epoch": 5.46,
+       "learning_rate": 2.2804510084167068e-05,
+       "loss": 0.0059,
+       "step": 69000
+     },
+     {
+       "epoch": 5.54,
+       "learning_rate": 2.2407495632841037e-05,
+       "loss": 0.0057,
+       "step": 70000
+     },
+     {
+       "epoch": 5.62,
+       "learning_rate": 2.2010481181515007e-05,
+       "loss": 0.0058,
+       "step": 71000
+     },
+     {
+       "epoch": 5.69,
+       "learning_rate": 2.161346673018898e-05,
+       "loss": 0.0059,
+       "step": 72000
+     },
+     {
+       "epoch": 5.77,
+       "learning_rate": 2.1216452278862953e-05,
+       "loss": 0.0059,
+       "step": 73000
+     },
+     {
+       "epoch": 5.85,
+       "learning_rate": 2.0819437827536923e-05,
+       "loss": 0.006,
+       "step": 74000
+     },
+     {
+       "epoch": 5.93,
+       "learning_rate": 2.0422423376210896e-05,
+       "loss": 0.0056,
+       "step": 75000
+     },
+     {
+       "epoch": 6.01,
+       "learning_rate": 2.002540892488487e-05,
+       "loss": 0.0058,
+       "step": 76000
+     },
+     {
+       "epoch": 6.09,
+       "learning_rate": 1.9628394473558838e-05,
+       "loss": 0.0054,
+       "step": 77000
+     },
+     {
+       "epoch": 6.17,
+       "learning_rate": 1.9231380022232808e-05,
+       "loss": 0.0061,
+       "step": 78000
+     },
+     {
+       "epoch": 6.25,
+       "learning_rate": 1.883436557090678e-05,
+       "loss": 0.0057,
+       "step": 79000
+     },
+     {
+       "epoch": 6.33,
+       "learning_rate": 1.8437351119580754e-05,
+       "loss": 0.0053,
+       "step": 80000
+     },
+     {
+       "epoch": 6.33,
+       "eval_accuracy": 0.5827929780167642,
+       "eval_exact_match": 0.5827929780167642,
+       "eval_f1": 0.5851441826137382,
+       "eval_loss": 0.004922935273498297,
+       "eval_precision": 0.5897517001423375,
+       "eval_recall": 0.5827929780167642,
+       "eval_runtime": 354.2336,
+       "eval_samples_per_second": 17.85,
+       "eval_steps_per_second": 8.926,
+       "step": 80000
+     },
+     {
+       "epoch": 6.41,
+       "learning_rate": 1.8040336668254727e-05,
+       "loss": 0.0054,
+       "step": 81000
+     },
+     {
+       "epoch": 6.49,
+       "learning_rate": 1.76433222169287e-05,
+       "loss": 0.0054,
+       "step": 82000
+     },
+     {
+       "epoch": 6.56,
+       "learning_rate": 1.7246307765602666e-05,
+       "loss": 0.0056,
+       "step": 83000
+     },
+     {
+       "epoch": 6.64,
+       "learning_rate": 1.684929331427664e-05,
+       "loss": 0.0055,
+       "step": 84000
+     },
+     {
+       "epoch": 6.72,
+       "learning_rate": 1.6452278862950612e-05,
+       "loss": 0.0051,
+       "step": 85000
+     },
+     {
+       "epoch": 6.8,
+       "learning_rate": 1.6055264411624585e-05,
+       "loss": 0.0057,
+       "step": 86000
+     },
+     {
+       "epoch": 6.88,
+       "learning_rate": 1.5658249960298555e-05,
+       "loss": 0.0053,
+       "step": 87000
+     },
+     {
+       "epoch": 6.96,
+       "learning_rate": 1.5261235508972528e-05,
+       "loss": 0.0053,
+       "step": 88000
+     },
+     {
+       "epoch": 7.04,
+       "learning_rate": 1.4864221057646499e-05,
+       "loss": 0.0049,
+       "step": 89000
+     },
+     {
+       "epoch": 7.12,
+       "learning_rate": 1.4467206606320472e-05,
+       "loss": 0.0054,
+       "step": 90000
+     },
+     {
+       "epoch": 7.2,
+       "learning_rate": 1.4070192154994443e-05,
+       "loss": 0.0052,
+       "step": 91000
+     },
+     {
+       "epoch": 7.28,
+       "learning_rate": 1.3673177703668413e-05,
+       "loss": 0.0052,
+       "step": 92000
+     },
+     {
+       "epoch": 7.36,
+       "learning_rate": 1.3276163252342386e-05,
+       "loss": 0.0053,
+       "step": 93000
+     },
+     {
+       "epoch": 7.43,
+       "learning_rate": 1.2879148801016357e-05,
+       "loss": 0.005,
+       "step": 94000
+     },
+     {
+       "epoch": 7.51,
+       "learning_rate": 1.2482134349690329e-05,
+       "loss": 0.0054,
+       "step": 95000
+     },
+     {
+       "epoch": 7.59,
+       "learning_rate": 1.2085119898364302e-05,
+       "loss": 0.0052,
+       "step": 96000
+     },
+     {
+       "epoch": 7.67,
+       "learning_rate": 1.1688105447038273e-05,
+       "loss": 0.0051,
+       "step": 97000
+     },
+     {
+       "epoch": 7.75,
+       "learning_rate": 1.1291090995712244e-05,
+       "loss": 0.005,
+       "step": 98000
+     },
+     {
+       "epoch": 7.83,
+       "learning_rate": 1.0894076544386216e-05,
+       "loss": 0.005,
+       "step": 99000
+     },
+     {
+       "epoch": 7.91,
+       "learning_rate": 1.0497062093060187e-05,
+       "loss": 0.005,
+       "step": 100000
+     },
+     {
+       "epoch": 7.91,
+       "eval_accuracy": 0.5921239917760557,
+       "eval_exact_match": 0.5921239917760557,
+       "eval_f1": 0.594369761189309,
+       "eval_loss": 0.004835059866309166,
+       "eval_precision": 0.5987664083504666,
+       "eval_recall": 0.5921239917760557,
+       "eval_runtime": 354.4854,
+       "eval_samples_per_second": 17.837,
+       "eval_steps_per_second": 8.92,
+       "step": 100000
+     },
+     {
+       "epoch": 7.99,
+       "learning_rate": 1.010004764173416e-05,
+       "loss": 0.005,
+       "step": 101000
+     }
+   ],
+   "logging_steps": 1000,
+   "max_steps": 126440,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 10,
+   "save_steps": 500,
+   "total_flos": 1.0696817746378752e+17,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
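
Since trainer_state.json is plain JSON, the loss curve and the periodic evaluations recorded in log_history are easy to pull out. A small sketch, assuming the file has been downloaded to the working directory:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training-loss entries are logged every 1000 steps; eval entries every 20000 steps.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(state["epoch"], state["global_step"])  # 8.0, 101152 at this checkpoint
print(eval_log[-1]["eval_exact_match"])      # ~0.592 at step 100000
```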
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c55e3399602b93c4ba608798a65f62e367a8d7bc1a83bdefd9ed7b7a6a55170
+ size 4219
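
training_args.bin is a pickled TrainingArguments object; together with optimizer.pt, scheduler.pt and rng_state.pth it is what lets Trainer resume from this checkpoint rather than restart. A hedged inspection sketch (transformers must be importable, since the pickle references its classes, and unpickling should only be done for checkpoints you trust):

```python
import torch

# Full pickle rather than a tensor file, so newer torch versions need weights_only=False.
args = torch.load("training_args.bin", weights_only=False)
print(args.num_train_epochs, args.per_device_train_batch_size, args.learning_rate)
```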