sekarmulyani committed on
Commit
85d7167
1 Parent(s): 5d3abcb

Upload 12 files

Browse files
Files changed (5) hide show
  1. optimizer.pt +1 -1
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +359 -3
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:010cc660b5d560447c7fb8b57393b8f97dba3cba806dcec5ee0218894aebf79d
3
  size 995641861
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1958c3dbb04813e9cfb37dc3270d881bdb32b10faf80bc576183dadd1d67b1f
3
  size 995641861
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8292a97db24ad842f8357890c37607ffdb8c0abbf25b087a8cc581fd45f68c4
3
  size 497807197
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d55afad0c1ff1e9d5816f48bc770e1719f184367f8577205df45a7ada8f25d79
3
  size 497807197
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fbc44877a85dc9e31508ab5cdcb4b09e15e4ccd881628820393d3ed5e0b4726
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e34348f4fc7163b2cbb271bcc37ef245469192a3f7f73ffc101fcb3e6ff34188
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:272d9d323ca9bc4225a532f1ca51900b269e2ae9a9366402febf725ced99fda9
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b4432ea76e605f5a2ab91b958cd7cebdc5efaf3ebb4c8fe1d3da0e35f885a53
3
  size 627
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 8.0,
5
  "eval_steps": 500,
6
- "global_step": 9456,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -171,13 +171,369 @@
171
  "learning_rate": 8.096446700507615e-06,
172
  "loss": 2.9722,
173
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  }
175
  ],
176
  "logging_steps": 500,
177
  "max_steps": 47280,
178
  "num_train_epochs": 40,
179
  "save_steps": 9456,
180
- "total_flos": 1.4819961470976e+16,
181
  "trial_name": null,
182
  "trial_params": null
183
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 24.0,
5
  "eval_steps": 500,
6
+ "global_step": 28368,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
171
  "learning_rate": 8.096446700507615e-06,
172
  "loss": 2.9722,
173
  "step": 9000
174
+ },
175
+ {
176
+ "epoch": 8.0,
177
+ "eval_loss": 3.004361391067505,
178
+ "eval_runtime": 6.0481,
179
+ "eval_samples_per_second": 42.327,
180
+ "eval_steps_per_second": 5.291,
181
+ "step": 9456
182
+ },
183
+ {
184
+ "epoch": 8.04,
185
+ "learning_rate": 7.990693739424705e-06,
186
+ "loss": 2.9628,
187
+ "step": 9500
188
+ },
189
+ {
190
+ "epoch": 8.46,
191
+ "learning_rate": 7.884940778341795e-06,
192
+ "loss": 2.9593,
193
+ "step": 10000
194
+ },
195
+ {
196
+ "epoch": 8.88,
197
+ "learning_rate": 7.779187817258885e-06,
198
+ "loss": 2.9485,
199
+ "step": 10500
200
+ },
201
+ {
202
+ "epoch": 9.0,
203
+ "eval_loss": 2.9940547943115234,
204
+ "eval_runtime": 6.1451,
205
+ "eval_samples_per_second": 41.659,
206
+ "eval_steps_per_second": 5.207,
207
+ "step": 10638
208
+ },
209
+ {
210
+ "epoch": 9.31,
211
+ "learning_rate": 7.673434856175973e-06,
212
+ "loss": 2.9405,
213
+ "step": 11000
214
+ },
215
+ {
216
+ "epoch": 9.73,
217
+ "learning_rate": 7.567681895093063e-06,
218
+ "loss": 2.943,
219
+ "step": 11500
220
+ },
221
+ {
222
+ "epoch": 10.0,
223
+ "eval_loss": 2.9857802391052246,
224
+ "eval_runtime": 6.2619,
225
+ "eval_samples_per_second": 40.882,
226
+ "eval_steps_per_second": 5.11,
227
+ "step": 11820
228
+ },
229
+ {
230
+ "epoch": 10.15,
231
+ "learning_rate": 7.461928934010153e-06,
232
+ "loss": 2.9243,
233
+ "step": 12000
234
+ },
235
+ {
236
+ "epoch": 10.58,
237
+ "learning_rate": 7.356175972927243e-06,
238
+ "loss": 2.9228,
239
+ "step": 12500
240
+ },
241
+ {
242
+ "epoch": 11.0,
243
+ "learning_rate": 7.2504230118443316e-06,
244
+ "loss": 2.9216,
245
+ "step": 13000
246
+ },
247
+ {
248
+ "epoch": 11.0,
249
+ "eval_loss": 2.9776651859283447,
250
+ "eval_runtime": 6.2854,
251
+ "eval_samples_per_second": 40.73,
252
+ "eval_steps_per_second": 5.091,
253
+ "step": 13002
254
+ },
255
+ {
256
+ "epoch": 11.42,
257
+ "learning_rate": 7.144670050761422e-06,
258
+ "loss": 2.9118,
259
+ "step": 13500
260
+ },
261
+ {
262
+ "epoch": 11.84,
263
+ "learning_rate": 7.038917089678512e-06,
264
+ "loss": 2.911,
265
+ "step": 14000
266
+ },
267
+ {
268
+ "epoch": 12.0,
269
+ "eval_loss": 2.9713006019592285,
270
+ "eval_runtime": 6.1107,
271
+ "eval_samples_per_second": 41.894,
272
+ "eval_steps_per_second": 5.237,
273
+ "step": 14184
274
+ },
275
+ {
276
+ "epoch": 12.27,
277
+ "learning_rate": 6.933164128595601e-06,
278
+ "loss": 2.9038,
279
+ "step": 14500
280
+ },
281
+ {
282
+ "epoch": 12.69,
283
+ "learning_rate": 6.827411167512691e-06,
284
+ "loss": 2.8924,
285
+ "step": 15000
286
+ },
287
+ {
288
+ "epoch": 13.0,
289
+ "eval_loss": 2.9653375148773193,
290
+ "eval_runtime": 6.1162,
291
+ "eval_samples_per_second": 41.856,
292
+ "eval_steps_per_second": 5.232,
293
+ "step": 15366
294
+ },
295
+ {
296
+ "epoch": 13.11,
297
+ "learning_rate": 6.721658206429781e-06,
298
+ "loss": 2.9025,
299
+ "step": 15500
300
+ },
301
+ {
302
+ "epoch": 13.54,
303
+ "learning_rate": 6.61590524534687e-06,
304
+ "loss": 2.8886,
305
+ "step": 16000
306
+ },
307
+ {
308
+ "epoch": 13.96,
309
+ "learning_rate": 6.51015228426396e-06,
310
+ "loss": 2.8882,
311
+ "step": 16500
312
+ },
313
+ {
314
+ "epoch": 14.0,
315
+ "eval_loss": 2.960761547088623,
316
+ "eval_runtime": 6.1211,
317
+ "eval_samples_per_second": 41.822,
318
+ "eval_steps_per_second": 5.228,
319
+ "step": 16548
320
+ },
321
+ {
322
+ "epoch": 14.38,
323
+ "learning_rate": 6.40439932318105e-06,
324
+ "loss": 2.8777,
325
+ "step": 17000
326
+ },
327
+ {
328
+ "epoch": 14.81,
329
+ "learning_rate": 6.298646362098139e-06,
330
+ "loss": 2.8826,
331
+ "step": 17500
332
+ },
333
+ {
334
+ "epoch": 15.0,
335
+ "eval_loss": 2.9559221267700195,
336
+ "eval_runtime": 6.0998,
337
+ "eval_samples_per_second": 41.969,
338
+ "eval_steps_per_second": 5.246,
339
+ "step": 17730
340
+ },
341
+ {
342
+ "epoch": 15.23,
343
+ "learning_rate": 6.1928934010152285e-06,
344
+ "loss": 2.8796,
345
+ "step": 18000
346
+ },
347
+ {
348
+ "epoch": 15.65,
349
+ "learning_rate": 6.0871404399323185e-06,
350
+ "loss": 2.8697,
351
+ "step": 18500
352
+ },
353
+ {
354
+ "epoch": 16.0,
355
+ "eval_loss": 2.952040672302246,
356
+ "eval_runtime": 6.2485,
357
+ "eval_samples_per_second": 40.97,
358
+ "eval_steps_per_second": 5.121,
359
+ "step": 18912
360
+ },
361
+ {
362
+ "epoch": 16.07,
363
+ "learning_rate": 5.981387478849409e-06,
364
+ "loss": 2.8645,
365
+ "step": 19000
366
+ },
367
+ {
368
+ "epoch": 16.5,
369
+ "learning_rate": 5.875634517766498e-06,
370
+ "loss": 2.8678,
371
+ "step": 19500
372
+ },
373
+ {
374
+ "epoch": 16.92,
375
+ "learning_rate": 5.769881556683588e-06,
376
+ "loss": 2.8616,
377
+ "step": 20000
378
+ },
379
+ {
380
+ "epoch": 17.0,
381
+ "eval_loss": 2.948793888092041,
382
+ "eval_runtime": 6.2711,
383
+ "eval_samples_per_second": 40.822,
384
+ "eval_steps_per_second": 5.103,
385
+ "step": 20094
386
+ },
387
+ {
388
+ "epoch": 17.34,
389
+ "learning_rate": 5.664128595600678e-06,
390
+ "loss": 2.8548,
391
+ "step": 20500
392
+ },
393
+ {
394
+ "epoch": 17.77,
395
+ "learning_rate": 5.558375634517766e-06,
396
+ "loss": 2.8529,
397
+ "step": 21000
398
+ },
399
+ {
400
+ "epoch": 18.0,
401
+ "eval_loss": 2.945361614227295,
402
+ "eval_runtime": 6.3517,
403
+ "eval_samples_per_second": 40.304,
404
+ "eval_steps_per_second": 5.038,
405
+ "step": 21276
406
+ },
407
+ {
408
+ "epoch": 18.19,
409
+ "learning_rate": 5.452622673434856e-06,
410
+ "loss": 2.8557,
411
+ "step": 21500
412
+ },
413
+ {
414
+ "epoch": 18.61,
415
+ "learning_rate": 5.346869712351946e-06,
416
+ "loss": 2.8448,
417
+ "step": 22000
418
+ },
419
+ {
420
+ "epoch": 19.0,
421
+ "eval_loss": 2.9428470134735107,
422
+ "eval_runtime": 6.2219,
423
+ "eval_samples_per_second": 41.145,
424
+ "eval_steps_per_second": 5.143,
425
+ "step": 22458
426
+ },
427
+ {
428
+ "epoch": 19.04,
429
+ "learning_rate": 5.241116751269036e-06,
430
+ "loss": 2.8458,
431
+ "step": 22500
432
+ },
433
+ {
434
+ "epoch": 19.46,
435
+ "learning_rate": 5.1353637901861255e-06,
436
+ "loss": 2.8462,
437
+ "step": 23000
438
+ },
439
+ {
440
+ "epoch": 19.88,
441
+ "learning_rate": 5.0296108291032155e-06,
442
+ "loss": 2.84,
443
+ "step": 23500
444
+ },
445
+ {
446
+ "epoch": 20.0,
447
+ "eval_loss": 2.940398693084717,
448
+ "eval_runtime": 6.2496,
449
+ "eval_samples_per_second": 40.962,
450
+ "eval_steps_per_second": 5.12,
451
+ "step": 23640
452
+ },
453
+ {
454
+ "epoch": 20.3,
455
+ "learning_rate": 4.923857868020305e-06,
456
+ "loss": 2.8349,
457
+ "step": 24000
458
+ },
459
+ {
460
+ "epoch": 20.73,
461
+ "learning_rate": 4.818104906937395e-06,
462
+ "loss": 2.8285,
463
+ "step": 24500
464
+ },
465
+ {
466
+ "epoch": 21.0,
467
+ "eval_loss": 2.938441276550293,
468
+ "eval_runtime": 6.1601,
469
+ "eval_samples_per_second": 41.558,
470
+ "eval_steps_per_second": 5.195,
471
+ "step": 24822
472
+ },
473
+ {
474
+ "epoch": 21.15,
475
+ "learning_rate": 4.712351945854484e-06,
476
+ "loss": 2.8345,
477
+ "step": 25000
478
+ },
479
+ {
480
+ "epoch": 21.57,
481
+ "learning_rate": 4.606598984771574e-06,
482
+ "loss": 2.8302,
483
+ "step": 25500
484
+ },
485
+ {
486
+ "epoch": 22.0,
487
+ "learning_rate": 4.500846023688664e-06,
488
+ "loss": 2.8266,
489
+ "step": 26000
490
+ },
491
+ {
492
+ "epoch": 22.0,
493
+ "eval_loss": 2.9362807273864746,
494
+ "eval_runtime": 6.0955,
495
+ "eval_samples_per_second": 41.998,
496
+ "eval_steps_per_second": 5.25,
497
+ "step": 26004
498
+ },
499
+ {
500
+ "epoch": 22.42,
501
+ "learning_rate": 4.395093062605753e-06,
502
+ "loss": 2.819,
503
+ "step": 26500
504
+ },
505
+ {
506
+ "epoch": 22.84,
507
+ "learning_rate": 4.289340101522843e-06,
508
+ "loss": 2.8232,
509
+ "step": 27000
510
+ },
511
+ {
512
+ "epoch": 23.0,
513
+ "eval_loss": 2.934544324874878,
514
+ "eval_runtime": 6.1597,
515
+ "eval_samples_per_second": 41.561,
516
+ "eval_steps_per_second": 5.195,
517
+ "step": 27186
518
+ },
519
+ {
520
+ "epoch": 23.27,
521
+ "learning_rate": 4.183587140439932e-06,
522
+ "loss": 2.8213,
523
+ "step": 27500
524
+ },
525
+ {
526
+ "epoch": 23.69,
527
+ "learning_rate": 4.0778341793570224e-06,
528
+ "loss": 2.8136,
529
+ "step": 28000
530
  }
531
  ],
532
  "logging_steps": 500,
533
  "max_steps": 47280,
534
  "num_train_epochs": 40,
535
  "save_steps": 9456,
536
+ "total_flos": 4.4459884412928e+16,
537
  "trial_name": null,
538
  "trial_params": null
539
  }