allstax committed
Commit a2b618a
1 Parent(s): 3238ff2

Upload folder using huggingface_hub

Files changed (5)
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +3 -323
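
The commit message above says the folder was pushed with huggingface_hub. A minimal sketch of how such an upload is typically done, assuming a local checkpoint directory and a target repo id; both names below are placeholders and are not taken from this commit:

    # Sketch only: folder path and repo id are hypothetical.
    from huggingface_hub import HfApi

    api = HfApi()  # uses the token stored by huggingface-cli login by default
    api.upload_folder(
        folder_path="./checkpoint-5760",   # hypothetical local checkpoint folder
        repo_id="allstax/your-model",      # hypothetical target repository
        repo_type="model",
        commit_message="Upload folder using huggingface_hub",
    )
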
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:164b5494ac7dc63ddd647f8c38f8282e5bab9fd22022d2076c188a5d2a5b1cfe
+ oid sha256:76d474395b6ded24d34bcba3b256936e3bae1cfde6619e057cb3113a59307aa8
  size 1625422896
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:88054f4b96e2c48d52f7e14d9dd88d01981d1c768ba436efeaceade549897e7f
+ oid sha256:86e3fe5f60ee64213cce6953f073415e24ea7076ada3726ea4f9730db4bf2b8f
  size 3250751759
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a6f76dc88a49fbe8084c46058a6690efc16d635806e103f2dda5d02d870b82e5
+ oid sha256:45b46a8489ea46d5b64d8445e0eb4a38d6580878b08ba5dfb0ca38bbfb48916c
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:420fc5f51a05e0274a48344decd52036c9e49d1a1fff581ca68f3034646a3c19
+ oid sha256:bef528486c1f6fe0aba355f463beb7a35f44ba66f2f32f593828dbf6daf991fb
  size 1064
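
Each of the four diffs above touches only a Git LFS pointer file: the "oid sha256:" line is the SHA-256 digest of the actual binary and "size" is its byte count, so the commit simply repoints each file at a new blob. A small sketch for checking a downloaded file against its pointer, assuming the binary has already been fetched (for example with git lfs pull); the expected digest below is the new model.safetensors oid from this commit:

    import hashlib

    def sha256_of(path, chunk_size=1 << 20):
        # Stream in chunks so multi-GB checkpoints do not need to fit in memory.
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)
        return h.hexdigest()

    expected = "76d474395b6ded24d34bcba3b256936e3bae1cfde6619e057cb3113a59307aa8"
    print(sha256_of("model.safetensors") == expected)
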
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.9848398495383563,
+ "epoch": 0.32827994984611875,
  "eval_steps": 720,
- "global_step": 17280,
+ "global_step": 5760,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -167,326 +167,6 @@
  "eval_samples_per_second": 1.158,
  "eval_steps_per_second": 0.3,
  "step": 5760
- },
- {
- "epoch": 0.37,
- "grad_norm": 0.4224866032600403,
- "learning_rate": 8.154797526431279e-05,
- "loss": 0.0904,
- "step": 6480
- },
- {
- "epoch": 0.37,
- "eval_bertscore": 0.7329062819480896,
- "eval_loss": 0.10619346052408218,
- "eval_rouge1": 0.564360420431112,
- "eval_rouge2": 0.3430202743146231,
- "eval_rougeL": 0.43258159937823415,
- "eval_rougeLsum": 0.4342682613180943,
- "eval_runtime": 43.9471,
- "eval_samples_per_second": 1.229,
- "eval_steps_per_second": 0.319,
- "step": 6480
- },
- {
- "epoch": 0.41,
- "grad_norm": 0.326298326253891,
- "learning_rate": 7.949616710837537e-05,
- "loss": 0.088,
- "step": 7200
- },
- {
- "epoch": 0.41,
- "eval_bertscore": 0.7319415807723999,
- "eval_loss": 0.10810621827840805,
- "eval_rouge1": 0.567691068841891,
- "eval_rouge2": 0.3516256462902142,
- "eval_rougeL": 0.42838027396990275,
- "eval_rougeLsum": 0.4324782503340911,
- "eval_runtime": 46.4934,
- "eval_samples_per_second": 1.161,
- "eval_steps_per_second": 0.301,
- "step": 7200
- },
- {
- "epoch": 0.45,
- "grad_norm": 0.38782864809036255,
- "learning_rate": 7.744720868598786e-05,
- "loss": 0.099,
- "step": 7920
- },
- {
- "epoch": 0.45,
- "eval_bertscore": 0.7281920909881592,
- "eval_loss": 0.1160830408334732,
- "eval_rouge1": 0.5638078274676868,
- "eval_rouge2": 0.33447827597228297,
- "eval_rougeL": 0.42103610080157494,
- "eval_rougeLsum": 0.421241459135431,
- "eval_runtime": 43.8757,
- "eval_samples_per_second": 1.231,
- "eval_steps_per_second": 0.319,
- "step": 7920
- },
- {
- "epoch": 0.49,
- "grad_norm": 0.3241174519062042,
- "learning_rate": 7.539540053005044e-05,
- "loss": 0.0872,
- "step": 8640
- },
- {
- "epoch": 0.49,
- "eval_bertscore": 0.7242642045021057,
- "eval_loss": 0.10293756425380707,
- "eval_rouge1": 0.5632532860588884,
- "eval_rouge2": 0.34322756987289393,
- "eval_rougeL": 0.42262041814568485,
- "eval_rougeLsum": 0.423440338568295,
- "eval_runtime": 46.2455,
- "eval_samples_per_second": 1.168,
- "eval_steps_per_second": 0.303,
- "step": 8640
- },
- {
- "epoch": 0.53,
- "grad_norm": 0.3583599030971527,
- "learning_rate": 7.334359237411302e-05,
- "loss": 0.0873,
- "step": 9360
- },
- {
- "epoch": 0.53,
- "eval_bertscore": 0.6995793581008911,
- "eval_loss": 0.1057819277048111,
- "eval_rouge1": 0.5146289688453939,
- "eval_rouge2": 0.31515963367438793,
- "eval_rougeL": 0.3849697348906047,
- "eval_rougeLsum": 0.3853863269798617,
- "eval_runtime": 41.7431,
- "eval_samples_per_second": 1.294,
- "eval_steps_per_second": 0.335,
- "step": 9360
- },
- {
- "epoch": 0.57,
- "grad_norm": 0.2910310626029968,
- "learning_rate": 7.12917842181756e-05,
- "loss": 0.0867,
- "step": 10080
- },
- {
- "epoch": 0.57,
- "eval_bertscore": 0.6976329684257507,
- "eval_loss": 0.1045340821146965,
- "eval_rouge1": 0.5207467825430098,
- "eval_rouge2": 0.3033279665805546,
- "eval_rougeL": 0.3864676775585604,
- "eval_rougeLsum": 0.38810836905339663,
- "eval_runtime": 42.5457,
- "eval_samples_per_second": 1.269,
- "eval_steps_per_second": 0.329,
- "step": 10080
- },
- {
- "epoch": 0.62,
- "grad_norm": 0.3698158860206604,
- "learning_rate": 6.92428257957881e-05,
- "loss": 0.0877,
- "step": 10800
- },
- {
- "epoch": 0.62,
- "eval_bertscore": 0.6928555369377136,
- "eval_loss": 0.10213906317949295,
- "eval_rouge1": 0.5052125219715984,
- "eval_rouge2": 0.2901105424948756,
- "eval_rougeL": 0.38512895532246294,
- "eval_rougeLsum": 0.38700266390157023,
- "eval_runtime": 43.2036,
- "eval_samples_per_second": 1.25,
- "eval_steps_per_second": 0.324,
- "step": 10800
- },
- {
- "epoch": 0.66,
- "grad_norm": 0.31327977776527405,
- "learning_rate": 6.719101763985067e-05,
- "loss": 0.0863,
- "step": 11520
- },
- {
- "epoch": 0.66,
- "eval_bertscore": 0.7075583338737488,
- "eval_loss": 0.10120192915201187,
- "eval_rouge1": 0.5255501659415194,
- "eval_rouge2": 0.313837826701734,
- "eval_rougeL": 0.40085373784140194,
- "eval_rougeLsum": 0.40434970576567464,
- "eval_runtime": 42.8949,
- "eval_samples_per_second": 1.259,
- "eval_steps_per_second": 0.326,
- "step": 11520
- },
- {
- "epoch": 0.7,
- "grad_norm": 0.3461964428424835,
- "learning_rate": 6.514205921746318e-05,
- "loss": 0.0872,
- "step": 12240
- },
- {
- "epoch": 0.7,
- "eval_bertscore": 0.7073290348052979,
- "eval_loss": 0.10406655818223953,
- "eval_rouge1": 0.5477131085772904,
- "eval_rouge2": 0.33149979593857803,
- "eval_rougeL": 0.40632334489545124,
- "eval_rougeLsum": 0.4095457087623684,
- "eval_runtime": 44.6457,
- "eval_samples_per_second": 1.21,
- "eval_steps_per_second": 0.314,
- "step": 12240
- },
- {
- "epoch": 0.74,
- "grad_norm": 0.5959820747375488,
- "learning_rate": 6.309025106152576e-05,
- "loss": 0.0865,
- "step": 12960
- },
- {
- "epoch": 0.74,
- "eval_bertscore": 0.7051340937614441,
- "eval_loss": 0.10186685621738434,
- "eval_rouge1": 0.5286009039113435,
- "eval_rouge2": 0.30974761597035483,
- "eval_rougeL": 0.39408942231662314,
- "eval_rougeLsum": 0.39582462237360283,
- "eval_runtime": 41.0675,
- "eval_samples_per_second": 1.315,
- "eval_steps_per_second": 0.341,
- "step": 12960
- },
- {
- "epoch": 0.78,
- "grad_norm": 0.3375673294067383,
- "learning_rate": 6.104129263913825e-05,
- "loss": 0.0882,
- "step": 13680
- },
- {
- "epoch": 0.78,
- "eval_bertscore": 0.7006374001502991,
- "eval_loss": 0.10748545080423355,
- "eval_rouge1": 0.5293328711395651,
- "eval_rouge2": 0.30648192947303854,
- "eval_rougeL": 0.38835230184676583,
- "eval_rougeLsum": 0.3893932601411799,
- "eval_runtime": 42.4292,
- "eval_samples_per_second": 1.273,
- "eval_steps_per_second": 0.33,
- "step": 13680
- },
- {
- "epoch": 0.82,
- "grad_norm": 0.2677787244319916,
- "learning_rate": 5.899233421675073e-05,
- "loss": 0.0974,
- "step": 14400
- },
- {
- "epoch": 0.82,
- "eval_bertscore": 0.7165916562080383,
- "eval_loss": 0.1007571741938591,
- "eval_rouge1": 0.5513926515300362,
- "eval_rouge2": 0.33911911389049787,
- "eval_rougeL": 0.416442801833816,
- "eval_rougeLsum": 0.4183281343761722,
- "eval_runtime": 44.5976,
- "eval_samples_per_second": 1.211,
- "eval_steps_per_second": 0.314,
- "step": 14400
- },
- {
- "epoch": 0.86,
- "grad_norm": 0.37346717715263367,
- "learning_rate": 5.694052606081331e-05,
- "loss": 0.0873,
- "step": 15120
- },
- {
- "epoch": 0.86,
- "eval_bertscore": 0.7286005616188049,
- "eval_loss": 0.10275202244520187,
- "eval_rouge1": 0.568867460495994,
- "eval_rouge2": 0.3456103655679248,
- "eval_rougeL": 0.43306516695459163,
- "eval_rougeLsum": 0.43448984907617605,
- "eval_runtime": 45.9446,
- "eval_samples_per_second": 1.175,
- "eval_steps_per_second": 0.305,
- "step": 15120
- },
- {
- "epoch": 0.9,
- "grad_norm": 0.42003852128982544,
- "learning_rate": 5.4888717904875894e-05,
- "loss": 0.0884,
- "step": 15840
- },
- {
- "epoch": 0.9,
- "eval_bertscore": 0.6985941529273987,
- "eval_loss": 0.10138168185949326,
- "eval_rouge1": 0.5142689394504161,
- "eval_rouge2": 0.29174269779985657,
- "eval_rougeL": 0.38750085273888524,
- "eval_rougeLsum": 0.38942879459463353,
- "eval_runtime": 45.3145,
- "eval_samples_per_second": 1.192,
- "eval_steps_per_second": 0.309,
- "step": 15840
- },
- {
- "epoch": 0.94,
- "grad_norm": 0.2986052334308624,
- "learning_rate": 5.283690974893848e-05,
- "loss": 0.0843,
- "step": 16560
- },
- {
- "epoch": 0.94,
- "eval_bertscore": 0.7196215391159058,
- "eval_loss": 0.09988830983638763,
- "eval_rouge1": 0.5568270207411694,
- "eval_rouge2": 0.3344249967035061,
- "eval_rougeL": 0.42228876150966843,
- "eval_rougeLsum": 0.4241723819280844,
- "eval_runtime": 43.9057,
- "eval_samples_per_second": 1.23,
- "eval_steps_per_second": 0.319,
- "step": 16560
- },
- {
- "epoch": 0.98,
- "grad_norm": 0.42629748582839966,
- "learning_rate": 5.078510159300106e-05,
- "loss": 0.0841,
- "step": 17280
- },
- {
- "epoch": 0.98,
- "eval_bertscore": 0.7275723814964294,
- "eval_loss": 0.09897469729185104,
- "eval_rouge1": 0.5719705231392143,
- "eval_rouge2": 0.3548031109092683,
- "eval_rougeL": 0.44224082293068945,
- "eval_rougeLsum": 0.4454366319399464,
- "eval_runtime": 44.0423,
- "eval_samples_per_second": 1.226,
- "eval_steps_per_second": 0.318,
- "step": 17280
  }
  ],
  "logging_steps": 720,
@@ -494,7 +174,7 @@
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 2880,
- "total_flos": 1.4979027009798144e+17,
+ "total_flos": 4.993009003266048e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null