Bingsu commited on
Commit
b97e6d1
1 Parent(s): 73b22f5

Training in progress, step 120000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66a5ef622dbe8d56e7b956a8e46b2dc79895c2e55948ea096cb19b5071a69a55
3
  size 586828837
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05e2d269f333c96c29da8075e36a6de506892a84e8ab7a1d79c6b5baf653edf5
3
  size 586828837
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4ef7a37dd78952cbb4fc37873637ea684f070d1e44bde74352b791d0c661cd4
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cc943ae46672312ee4175b7b0df7b2bcb16bb1598452afd869122102f93e701
3
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e483b397451e40991ea9aa448be1d4addf388b682e3c225fd31dfc476ba76800
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf3e3ff5ca04195d00ae182843134a34efdb2e565df68413f5842b7a4a84c37b
3
  size 14503
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42dbbdc5cc13bd878c323aa5fdc58a0d0d735dc686a8a6a2c1206cf27279185b
3
  size 559
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a6c740782a206d1a7821b1fbc9827af7a83dbc888bd997056c93056ef861be
3
  size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:645860789a54b6c41f5c11e77f6f0318ed7d04c0720af434b0a590d8b30f84c0
3
  size 733555848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2812a708855da00be5c7a2b5b6519990cb027a8d82f04f202c74834685819f6
3
  size 733555848
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.47269742637737583,
5
- "global_step": 110000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -3306,11 +3306,311 @@
3306
  "learning_rate": 0.003245247489550804,
3307
  "loss": 8.6364,
3308
  "step": 110000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3309
  }
3310
  ],
3311
  "max_steps": 1000000,
3312
  "num_train_epochs": 5,
3313
- "total_flos": 1.7532129804288e+17,
3314
  "trial_name": null,
3315
  "trial_params": null
3316
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.51566991968441,
5
+ "global_step": 120000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
3306
  "learning_rate": 0.003245247489550804,
3307
  "loss": 8.6364,
3308
  "step": 110000
3309
+ },
3310
+ {
3311
+ "epoch": 0.47,
3312
+ "learning_rate": 0.0032544335062216403,
3313
+ "loss": 8.6422,
3314
+ "step": 110200
3315
+ },
3316
+ {
3317
+ "epoch": 0.47,
3318
+ "learning_rate": 0.0032636280571558636,
3319
+ "loss": 8.618,
3320
+ "step": 110400
3321
+ },
3322
+ {
3323
+ "epoch": 0.48,
3324
+ "learning_rate": 0.003272831102021408,
3325
+ "loss": 8.6276,
3326
+ "step": 110600
3327
+ },
3328
+ {
3329
+ "epoch": 0.48,
3330
+ "learning_rate": 0.003282042600448948,
3331
+ "loss": 8.6454,
3332
+ "step": 110800
3333
+ },
3334
+ {
3335
+ "epoch": 0.48,
3336
+ "learning_rate": 0.0032912625120320753,
3337
+ "loss": 8.6388,
3338
+ "step": 111000
3339
+ },
3340
+ {
3341
+ "epoch": 0.48,
3342
+ "learning_rate": 0.0033004907963274733,
3343
+ "loss": 8.6339,
3344
+ "step": 111200
3345
+ },
3346
+ {
3347
+ "epoch": 0.48,
3348
+ "learning_rate": 0.003309727412855108,
3349
+ "loss": 8.6243,
3350
+ "step": 111400
3351
+ },
3352
+ {
3353
+ "epoch": 0.48,
3354
+ "learning_rate": 0.0033189723210983865,
3355
+ "loss": 8.6264,
3356
+ "step": 111600
3357
+ },
3358
+ {
3359
+ "epoch": 0.48,
3360
+ "learning_rate": 0.0033282254805043487,
3361
+ "loss": 8.6401,
3362
+ "step": 111800
3363
+ },
3364
+ {
3365
+ "epoch": 0.48,
3366
+ "learning_rate": 0.003337440523277331,
3367
+ "loss": 8.6366,
3368
+ "step": 112000
3369
+ },
3370
+ {
3371
+ "epoch": 0.48,
3372
+ "learning_rate": 0.0033467100224565524,
3373
+ "loss": 8.6338,
3374
+ "step": 112200
3375
+ },
3376
+ {
3377
+ "epoch": 0.48,
3378
+ "learning_rate": 0.003355987651126521,
3379
+ "loss": 8.6377,
3380
+ "step": 112400
3381
+ },
3382
+ {
3383
+ "epoch": 0.48,
3384
+ "learning_rate": 0.0033652733685907424,
3385
+ "loss": 8.6414,
3386
+ "step": 112600
3387
+ },
3388
+ {
3389
+ "epoch": 0.48,
3390
+ "learning_rate": 0.0033745671341172496,
3391
+ "loss": 8.6264,
3392
+ "step": 112800
3393
+ },
3394
+ {
3395
+ "epoch": 0.49,
3396
+ "learning_rate": 0.0033838689069387654,
3397
+ "loss": 8.6289,
3398
+ "step": 113000
3399
+ },
3400
+ {
3401
+ "epoch": 0.49,
3402
+ "learning_rate": 0.00339317864625289,
3403
+ "loss": 8.6244,
3404
+ "step": 113200
3405
+ },
3406
+ {
3407
+ "epoch": 0.49,
3408
+ "learning_rate": 0.003402496311222283,
3409
+ "loss": 8.6287,
3410
+ "step": 113400
3411
+ },
3412
+ {
3413
+ "epoch": 0.49,
3414
+ "learning_rate": 0.0034118218609748346,
3415
+ "loss": 8.6251,
3416
+ "step": 113600
3417
+ },
3418
+ {
3419
+ "epoch": 0.49,
3420
+ "learning_rate": 0.003421155254603846,
3421
+ "loss": 8.6214,
3422
+ "step": 113800
3423
+ },
3424
+ {
3425
+ "epoch": 0.49,
3426
+ "learning_rate": 0.0034304964511682147,
3427
+ "loss": 8.6303,
3428
+ "step": 114000
3429
+ },
3430
+ {
3431
+ "epoch": 0.49,
3432
+ "learning_rate": 0.0034398454096926092,
3433
+ "loss": 8.6369,
3434
+ "step": 114200
3435
+ },
3436
+ {
3437
+ "epoch": 0.49,
3438
+ "learning_rate": 0.003449202089167651,
3439
+ "loss": 8.6236,
3440
+ "step": 114400
3441
+ },
3442
+ {
3443
+ "epoch": 0.49,
3444
+ "learning_rate": 0.0034585196077173436,
3445
+ "loss": 8.6251,
3446
+ "step": 114600
3447
+ },
3448
+ {
3449
+ "epoch": 0.49,
3450
+ "learning_rate": 0.003467891567838331,
3451
+ "loss": 8.6295,
3452
+ "step": 114800
3453
+ },
3454
+ {
3455
+ "epoch": 0.49,
3456
+ "learning_rate": 0.003477271125884973,
3457
+ "loss": 8.6219,
3458
+ "step": 115000
3459
+ },
3460
+ {
3461
+ "epoch": 0.5,
3462
+ "learning_rate": 0.0034866582407136653,
3463
+ "loss": 8.6271,
3464
+ "step": 115200
3465
+ },
3466
+ {
3467
+ "epoch": 0.5,
3468
+ "learning_rate": 0.003496052871147656,
3469
+ "loss": 8.6372,
3470
+ "step": 115400
3471
+ },
3472
+ {
3473
+ "epoch": 0.5,
3474
+ "learning_rate": 0.0035054549759772242,
3475
+ "loss": 8.6238,
3476
+ "step": 115600
3477
+ },
3478
+ {
3479
+ "epoch": 0.5,
3480
+ "learning_rate": 0.0035148645139598637,
3481
+ "loss": 8.6207,
3482
+ "step": 115800
3483
+ },
3484
+ {
3485
+ "epoch": 0.5,
3486
+ "learning_rate": 0.0035242814438204637,
3487
+ "loss": 8.6099,
3488
+ "step": 116000
3489
+ },
3490
+ {
3491
+ "epoch": 0.5,
3492
+ "learning_rate": 0.0035337057242514833,
3493
+ "loss": 8.6142,
3494
+ "step": 116200
3495
+ },
3496
+ {
3497
+ "epoch": 0.5,
3498
+ "learning_rate": 0.0035431373139131472,
3499
+ "loss": 8.6033,
3500
+ "step": 116400
3501
+ },
3502
+ {
3503
+ "epoch": 0.5,
3504
+ "learning_rate": 0.0035525761714336104,
3505
+ "loss": 8.6178,
3506
+ "step": 116600
3507
+ },
3508
+ {
3509
+ "epoch": 0.5,
3510
+ "learning_rate": 0.0035619750070819923,
3511
+ "loss": 8.6138,
3512
+ "step": 116800
3513
+ },
3514
+ {
3515
+ "epoch": 0.5,
3516
+ "learning_rate": 0.0035714282402552104,
3517
+ "loss": 8.6143,
3518
+ "step": 117000
3519
+ },
3520
+ {
3521
+ "epoch": 0.5,
3522
+ "learning_rate": 0.0035808886171885554,
3523
+ "loss": 8.6034,
3524
+ "step": 117200
3525
+ },
3526
+ {
3527
+ "epoch": 0.5,
3528
+ "learning_rate": 0.0035903560963839124,
3529
+ "loss": 8.6156,
3530
+ "step": 117400
3531
+ },
3532
+ {
3533
+ "epoch": 0.51,
3534
+ "learning_rate": 0.0035998306363120057,
3535
+ "loss": 8.6148,
3536
+ "step": 117600
3537
+ },
3538
+ {
3539
+ "epoch": 0.51,
3540
+ "learning_rate": 0.0036093121954125906,
3541
+ "loss": 8.6039,
3542
+ "step": 117800
3543
+ },
3544
+ {
3545
+ "epoch": 0.51,
3546
+ "learning_rate": 0.003618800732094636,
3547
+ "loss": 8.6107,
3548
+ "step": 118000
3549
+ },
3550
+ {
3551
+ "epoch": 0.51,
3552
+ "learning_rate": 0.0036282962047364973,
3553
+ "loss": 8.6094,
3554
+ "step": 118200
3555
+ },
3556
+ {
3557
+ "epoch": 0.51,
3558
+ "learning_rate": 0.0036377985716861084,
3559
+ "loss": 8.616,
3560
+ "step": 118400
3561
+ },
3562
+ {
3563
+ "epoch": 0.51,
3564
+ "learning_rate": 0.003647307791261164,
3565
+ "loss": 8.6135,
3566
+ "step": 118600
3567
+ },
3568
+ {
3569
+ "epoch": 0.51,
3570
+ "learning_rate": 0.003656823821749292,
3571
+ "loss": 8.6062,
3572
+ "step": 118800
3573
+ },
3574
+ {
3575
+ "epoch": 0.51,
3576
+ "learning_rate": 0.0036662989906407328,
3577
+ "loss": 8.6029,
3578
+ "step": 119000
3579
+ },
3580
+ {
3581
+ "epoch": 0.51,
3582
+ "learning_rate": 0.0036758284841655496,
3583
+ "loss": 8.6011,
3584
+ "step": 119200
3585
+ },
3586
+ {
3587
+ "epoch": 0.51,
3588
+ "learning_rate": 0.0036853646634968946,
3589
+ "loss": 8.5993,
3590
+ "step": 119400
3591
+ },
3592
+ {
3593
+ "epoch": 0.51,
3594
+ "learning_rate": 0.003694907486804143,
3595
+ "loss": 8.6029,
3596
+ "step": 119600
3597
+ },
3598
+ {
3599
+ "epoch": 0.51,
3600
+ "learning_rate": 0.00370445691222752,
3601
+ "loss": 8.6018,
3602
+ "step": 119800
3603
+ },
3604
+ {
3605
+ "epoch": 0.52,
3606
+ "learning_rate": 0.003714012897878298,
3607
+ "loss": 8.5978,
3608
+ "step": 120000
3609
  }
3610
  ],
3611
  "max_steps": 1000000,
3612
  "num_train_epochs": 5,
3613
+ "total_flos": 1.9125959786496e+17,
3614
  "trial_name": null,
3615
  "trial_params": null
3616
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4ef7a37dd78952cbb4fc37873637ea684f070d1e44bde74352b791d0c661cd4
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cc943ae46672312ee4175b7b0df7b2bcb16bb1598452afd869122102f93e701
3
  size 146774203