Stuti103 committed · Commit 0d38df9 · verified · 1 parent: 0ff1662

Training in progress, step 22800, checkpoint

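Commits with this shape (a "Training in progress, step N, checkpoint" message that rewrites a last-checkpoint/ folder) are typically produced by the Hugging Face Trainer when it is configured to push intermediate checkpoints to the Hub. A minimal sketch of such a configuration, assuming the transformers Trainer API; the save interval is only inferred from the jump from step 22500 to 22800, and the other values echo what trainer_state.json records below:

    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="outputs",            # assumption: local output directory
        per_device_train_batch_size=2,   # "train_batch_size": 2 is recorded below
        logging_steps=10,                # "logging_steps": 10 is recorded below
        save_steps=300,                  # assumption: previous checkpoint was step 22500
        push_to_hub=True,
        hub_strategy="checkpoint",       # keeps the latest state in last-checkpoint/
    )

    # Training could later be resumed from the state captured in this commit with
    # trainer.train(resume_from_checkpoint="last-checkpoint").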
.gitattributes CHANGED
@@ -34,4 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
- checkpoint-22800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ce5bc62b462cec671c252739f2dc62bd7fba894afdb3be9a2d2c7507560195bc
+ oid sha256:237f5c17c55df679a8e8f4a65ad9de09e2a99a2eaba9876aace075096abcfb63
  size 3541119728
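adapter_model.safetensors is the filename PEFT uses for adapter weights, which suggests this checkpoint holds a PEFT/LoRA-style adapter rather than full model weights. A minimal sketch of attaching it to a base model after downloading the checkpoint; BASE_MODEL_ID is a placeholder, since the commit does not say which base model the adapter belongs to:

    from transformers import AutoModelForCausalLM
    from peft import PeftModel

    # Placeholder id -- the base model is an assumption, not stated in this commit.
    base = AutoModelForCausalLM.from_pretrained("BASE_MODEL_ID")

    # Reads adapter_config.json and adapter_model.safetensors from the folder.
    model = PeftModel.from_pretrained(base, "last-checkpoint")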
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8ed691e41a2ff7f7d475bdb81ca2c3a7572376d28d14ce571a0d4355792b6e26
+ oid sha256:9cc82f803cbf27fa5d02dc20006fbaf09405895a4d61a6169c832576c2db2940
  size 778374186
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:21ea56942b563c971fd85c03a66bbd99dd86b68826dc8364e7984139050ee071
+ oid sha256:959293b55cb243a7a3af582584a0698f2aeb95373b8b27dd72c03d8f0bdce376
  size 1064
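optimizer.pt and scheduler.pt are plain torch.save dumps of the optimizer and learning-rate-scheduler state_dicts, which is what makes exact resumption possible. A minimal sketch for inspecting them locally (field names vary with the optimizer and scheduler actually used, which this commit does not identify):

    import torch

    sched_state = torch.load("last-checkpoint/scheduler.pt", map_location="cpu")
    opt_state = torch.load("last-checkpoint/optimizer.pt", map_location="cpu")

    print(sorted(sched_state.keys()))           # e.g. base_lrs, last_epoch, _last_lr, ...
    print(opt_state["param_groups"][0]["lr"])   # current learning rate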
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 1.0737302998126916,
+ "epoch": 1.0880468628831172,
  "eval_steps": 500,
- "global_step": 22500,
+ "global_step": 22800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -20258,6 +20258,276 @@
  "mean_token_accuracy": 0.8939898759126663,
  "num_tokens": 37361844.0,
  "step": 22500
+ },
+ {
+ "epoch": 1.074207518581706,
+ "grad_norm": 0.37823808193206787,
+ "learning_rate": 9.258410880458124e-06,
+ "loss": 0.6272,
+ "mean_token_accuracy": 0.8770518571138382,
+ "num_tokens": 37379000.0,
+ "step": 22510
+ },
+ {
+ "epoch": 1.07468473735072,
+ "grad_norm": 0.4005347788333893,
+ "learning_rate": 9.253638749701743e-06,
+ "loss": 0.6015,
+ "mean_token_accuracy": 0.8748599126935005,
+ "num_tokens": 37395773.0,
+ "step": 22520
+ },
+ {
+ "epoch": 1.075161956119734,
+ "grad_norm": 0.39781296253204346,
+ "learning_rate": 9.24886661894536e-06,
+ "loss": 0.6869,
+ "mean_token_accuracy": 0.8716883912682534,
+ "num_tokens": 37415210.0,
+ "step": 22530
+ },
+ {
+ "epoch": 1.0756391748887484,
+ "grad_norm": 0.39247405529022217,
+ "learning_rate": 9.244094488188978e-06,
+ "loss": 0.7395,
+ "mean_token_accuracy": 0.8584993034601212,
+ "num_tokens": 37432118.0,
+ "step": 22540
+ },
+ {
+ "epoch": 1.0761163936577625,
+ "grad_norm": 0.3580706715583801,
+ "learning_rate": 9.239322357432594e-06,
+ "loss": 0.79,
+ "mean_token_accuracy": 0.8506992489099503,
+ "num_tokens": 37452181.0,
+ "step": 22550
+ },
+ {
+ "epoch": 1.0765936124267768,
+ "grad_norm": 0.3170486092567444,
+ "learning_rate": 9.234550226676213e-06,
+ "loss": 0.6072,
+ "mean_token_accuracy": 0.8733228012919426,
+ "num_tokens": 37468470.0,
+ "step": 22560
+ },
+ {
+ "epoch": 1.0770708311957908,
+ "grad_norm": 0.44699838757514954,
+ "learning_rate": 9.22977809591983e-06,
+ "loss": 0.5566,
+ "mean_token_accuracy": 0.8752368673682213,
+ "num_tokens": 37485065.0,
+ "step": 22570
+ },
+ {
+ "epoch": 1.0775480499648051,
+ "grad_norm": 0.5032857656478882,
+ "learning_rate": 9.225005965163446e-06,
+ "loss": 0.656,
+ "mean_token_accuracy": 0.8768767550587654,
+ "num_tokens": 37501662.0,
+ "step": 22580
+ },
+ {
+ "epoch": 1.0780252687338192,
+ "grad_norm": 0.3128605782985687,
+ "learning_rate": 9.220233834407063e-06,
+ "loss": 0.6645,
+ "mean_token_accuracy": 0.8662539958953858,
+ "num_tokens": 37519154.0,
+ "step": 22590
+ },
+ {
+ "epoch": 1.0785024875028335,
+ "grad_norm": 0.3933728039264679,
+ "learning_rate": 9.215461703650681e-06,
+ "loss": 0.5397,
+ "mean_token_accuracy": 0.8856153175234794,
+ "num_tokens": 37534651.0,
+ "step": 22600
+ },
+ {
+ "epoch": 1.0789797062718476,
+ "grad_norm": 0.5340325832366943,
+ "learning_rate": 9.210689572894298e-06,
+ "loss": 0.6269,
+ "mean_token_accuracy": 0.8772767931222916,
+ "num_tokens": 37551794.0,
+ "step": 22610
+ },
+ {
+ "epoch": 1.0794569250408619,
+ "grad_norm": 0.3841538429260254,
+ "learning_rate": 9.205917442137915e-06,
+ "loss": 0.6175,
+ "mean_token_accuracy": 0.8755568400025368,
+ "num_tokens": 37567508.0,
+ "step": 22620
+ },
+ {
+ "epoch": 1.079934143809876,
+ "grad_norm": 0.37845683097839355,
+ "learning_rate": 9.201145311381533e-06,
+ "loss": 0.5757,
+ "mean_token_accuracy": 0.879976649582386,
+ "num_tokens": 37582508.0,
+ "step": 22630
+ },
+ {
+ "epoch": 1.0804113625788903,
+ "grad_norm": 0.3559890687465668,
+ "learning_rate": 9.19637318062515e-06,
+ "loss": 0.8135,
+ "mean_token_accuracy": 0.8407615974545479,
+ "num_tokens": 37601326.0,
+ "step": 22640
+ },
+ {
+ "epoch": 1.0808885813479043,
+ "grad_norm": 0.32038992643356323,
+ "learning_rate": 9.191601049868766e-06,
+ "loss": 0.5877,
+ "mean_token_accuracy": 0.885163950920105,
+ "num_tokens": 37616610.0,
+ "step": 22650
+ },
+ {
+ "epoch": 1.0813658001169186,
+ "grad_norm": 0.366234689950943,
+ "learning_rate": 9.186828919112385e-06,
+ "loss": 0.6438,
+ "mean_token_accuracy": 0.8744160294532776,
+ "num_tokens": 37633602.0,
+ "step": 22660
+ },
+ {
+ "epoch": 1.0818430188859327,
+ "grad_norm": 0.32627347111701965,
+ "learning_rate": 9.182056788356001e-06,
+ "loss": 0.6948,
+ "mean_token_accuracy": 0.8610922127962113,
+ "num_tokens": 37651592.0,
+ "step": 22670
+ },
+ {
+ "epoch": 1.082320237654947,
+ "grad_norm": 0.3474673628807068,
+ "learning_rate": 9.17728465759962e-06,
+ "loss": 0.6262,
+ "mean_token_accuracy": 0.8750404015183448,
+ "num_tokens": 37668010.0,
+ "step": 22680
+ },
+ {
+ "epoch": 1.082797456423961,
+ "grad_norm": 0.3955213129520416,
+ "learning_rate": 9.172512526843236e-06,
+ "loss": 0.5588,
+ "mean_token_accuracy": 0.8861236184835434,
+ "num_tokens": 37684538.0,
+ "step": 22690
+ },
+ {
+ "epoch": 1.0832746751929754,
+ "grad_norm": 0.4451896846294403,
+ "learning_rate": 9.167740396086855e-06,
+ "loss": 0.5774,
+ "mean_token_accuracy": 0.8859012797474861,
+ "num_tokens": 37700694.0,
+ "step": 22700
+ },
+ {
+ "epoch": 1.0837518939619895,
+ "grad_norm": 0.41938453912734985,
+ "learning_rate": 9.162968265330471e-06,
+ "loss": 0.6575,
+ "mean_token_accuracy": 0.8762999802827836,
+ "num_tokens": 37716717.0,
+ "step": 22710
+ },
+ {
+ "epoch": 1.0842291127310038,
+ "grad_norm": 0.38627904653549194,
+ "learning_rate": 9.158196134574088e-06,
+ "loss": 0.6263,
+ "mean_token_accuracy": 0.8728866443037987,
+ "num_tokens": 37734196.0,
+ "step": 22720
+ },
+ {
+ "epoch": 1.0847063315000178,
+ "grad_norm": 0.39531171321868896,
+ "learning_rate": 9.153424003817706e-06,
+ "loss": 0.5782,
+ "mean_token_accuracy": 0.8879543572664261,
+ "num_tokens": 37750684.0,
+ "step": 22730
+ },
+ {
+ "epoch": 1.0851835502690321,
+ "grad_norm": 0.3783516585826874,
+ "learning_rate": 9.148651873061323e-06,
+ "loss": 0.599,
+ "mean_token_accuracy": 0.8701232433319092,
+ "num_tokens": 37767146.0,
+ "step": 22740
+ },
+ {
+ "epoch": 1.0856607690380462,
+ "grad_norm": 0.39319974184036255,
+ "learning_rate": 9.14387974230494e-06,
+ "loss": 0.6413,
+ "mean_token_accuracy": 0.8686925515532493,
+ "num_tokens": 37787034.0,
+ "step": 22750
+ },
+ {
+ "epoch": 1.0861379878070605,
+ "grad_norm": 0.41720524430274963,
+ "learning_rate": 9.139107611548556e-06,
+ "loss": 0.669,
+ "mean_token_accuracy": 0.8737372472882271,
+ "num_tokens": 37802820.0,
+ "step": 22760
+ },
+ {
+ "epoch": 1.0866152065760746,
+ "grad_norm": 0.5915963053703308,
+ "learning_rate": 9.134335480792175e-06,
+ "loss": 0.7127,
+ "mean_token_accuracy": 0.8542029947042465,
+ "num_tokens": 37820083.0,
+ "step": 22770
+ },
+ {
+ "epoch": 1.0870924253450889,
+ "grad_norm": 0.48407578468322754,
+ "learning_rate": 9.129563350035791e-06,
+ "loss": 0.6094,
+ "mean_token_accuracy": 0.8686896711587906,
+ "num_tokens": 37836877.0,
+ "step": 22780
+ },
+ {
+ "epoch": 1.087569644114103,
+ "grad_norm": 0.411697119474411,
+ "learning_rate": 9.124791219279408e-06,
+ "loss": 0.5874,
+ "mean_token_accuracy": 0.8753976777195931,
+ "num_tokens": 37852853.0,
+ "step": 22790
+ },
+ {
+ "epoch": 1.0880468628831172,
+ "grad_norm": 0.43069422245025635,
+ "learning_rate": 9.120019088523026e-06,
+ "loss": 0.6337,
+ "mean_token_accuracy": 0.8802076116204262,
+ "num_tokens": 37869204.0,
+ "step": 22800
  }
  ],
  "logging_steps": 10,
@@ -20277,7 +20547,7 @@
  "attributes": {}
  }
  },
- "total_flos": 8.413914307123446e+17,
+ "total_flos": 8.528831995220091e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null