broodmother41 committed on
Commit 4e00db3 · verified · 1 Parent(s): aefa34d

Training in progress, step 750, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:53de51910fe256e39a8dab15b3b9d88b1e32692ee402ffa4c8c31f0bdd898078
+ oid sha256:20576ea29e8be0c4360c63fcfd6599fe6ed1ccaf070aacb265c8263331c04b3a
  size 671149168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4077c89923cf539023189a42e0ca81f0605f9947f62676f393f78158c99dcf42
+ oid sha256:4f510575b1aa3e1648b5bfc57047308e06b6702537cb3135095ab24c37a88337
  size 341314644
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:17d6f541a72ee7fc28ef488bf48f548b9923f079676f1d225f856336b32c304b
+ oid sha256:5308048dd0c014787972e4c3767cd80f6a3368dfd7ebb6f842008ecf91ddc070
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:db75167d9b6c2f4e2365c7b7c1f58f3cefca81af2686abbae14d4522016893c1
+ oid sha256:6a197a1a4d059a9ce0958158bb28df1470f0554d236703a402b461e53e77ae60
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
- "best_metric": 0.9845598936080933,
- "best_model_checkpoint": "miner_id_24/checkpoint-600",
- "epoch": 0.3012300225922517,
+ "best_metric": 0.9616905450820923,
+ "best_model_checkpoint": "miner_id_24/checkpoint-750",
+ "epoch": 0.3765375282403146,
  "eval_steps": 150,
- "global_step": 600,
+ "global_step": 750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -4247,6 +4247,1064 @@
  "eval_samples_per_second": 21.26,
  "eval_steps_per_second": 2.659,
  "step": 600
4250
+ },
4251
+ {
4252
+ "epoch": 0.3017320726299054,
4253
+ "grad_norm": 0.445065438747406,
4254
+ "learning_rate": 6.655688009427832e-05,
4255
+ "loss": 1.2529,
4256
+ "step": 601
4257
+ },
4258
+ {
4259
+ "epoch": 0.3022341226675592,
4260
+ "grad_norm": 0.39952167868614197,
4261
+ "learning_rate": 6.645669584873494e-05,
4262
+ "loss": 1.2194,
4263
+ "step": 602
4264
+ },
4265
+ {
4266
+ "epoch": 0.30273617270521297,
4267
+ "grad_norm": 0.403266042470932,
4268
+ "learning_rate": 6.635643745188734e-05,
4269
+ "loss": 1.2289,
4270
+ "step": 603
4271
+ },
4272
+ {
4273
+ "epoch": 0.3032382227428667,
4274
+ "grad_norm": 0.38917073607444763,
4275
+ "learning_rate": 6.625610535548418e-05,
4276
+ "loss": 1.1336,
4277
+ "step": 604
4278
+ },
4279
+ {
4280
+ "epoch": 0.30374027278052046,
4281
+ "grad_norm": 0.4072120785713196,
4282
+ "learning_rate": 6.615570001160626e-05,
4283
+ "loss": 1.0642,
4284
+ "step": 605
4285
+ },
4286
+ {
4287
+ "epoch": 0.3042423228181742,
4288
+ "grad_norm": 0.4204983711242676,
4289
+ "learning_rate": 6.605522187266441e-05,
4290
+ "loss": 1.0719,
4291
+ "step": 606
4292
+ },
4293
+ {
4294
+ "epoch": 0.30474437285582795,
4295
+ "grad_norm": 0.39132463932037354,
4296
+ "learning_rate": 6.595467139139743e-05,
4297
+ "loss": 1.0398,
4298
+ "step": 607
4299
+ },
4300
+ {
4301
+ "epoch": 0.3052464228934817,
4302
+ "grad_norm": 0.35773175954818726,
4303
+ "learning_rate": 6.585404902087011e-05,
4304
+ "loss": 1.0631,
4305
+ "step": 608
4306
+ },
4307
+ {
4308
+ "epoch": 0.3057484729311355,
4309
+ "grad_norm": 0.36051151156425476,
4310
+ "learning_rate": 6.575335521447114e-05,
4311
+ "loss": 1.04,
4312
+ "step": 609
4313
+ },
4314
+ {
4315
+ "epoch": 0.30625052296878924,
4316
+ "grad_norm": 0.36739856004714966,
4317
+ "learning_rate": 6.565259042591113e-05,
4318
+ "loss": 1.0239,
4319
+ "step": 610
4320
+ },
4321
+ {
4322
+ "epoch": 0.306752573006443,
4323
+ "grad_norm": 0.3616657853126526,
4324
+ "learning_rate": 6.555175510922047e-05,
4325
+ "loss": 1.0545,
4326
+ "step": 611
4327
+ },
4328
+ {
4329
+ "epoch": 0.30725462304409673,
4330
+ "grad_norm": 0.3667794167995453,
4331
+ "learning_rate": 6.545084971874738e-05,
4332
+ "loss": 0.9624,
4333
+ "step": 612
4334
+ },
4335
+ {
4336
+ "epoch": 0.3077566730817505,
4337
+ "grad_norm": 0.3631950318813324,
4338
+ "learning_rate": 6.53498747091558e-05,
4339
+ "loss": 1.0004,
4340
+ "step": 613
4341
+ },
4342
+ {
4343
+ "epoch": 0.3082587231194042,
4344
+ "grad_norm": 0.35089895129203796,
4345
+ "learning_rate": 6.524883053542339e-05,
4346
+ "loss": 1.0094,
4347
+ "step": 614
4348
+ },
4349
+ {
4350
+ "epoch": 0.30876077315705797,
4351
+ "grad_norm": 0.38375306129455566,
4352
+ "learning_rate": 6.514771765283942e-05,
4353
+ "loss": 1.018,
4354
+ "step": 615
4355
+ },
4356
+ {
4357
+ "epoch": 0.30926282319471177,
4358
+ "grad_norm": 0.3634318709373474,
4359
+ "learning_rate": 6.504653651700278e-05,
4360
+ "loss": 1.0375,
4361
+ "step": 616
4362
+ },
4363
+ {
4364
+ "epoch": 0.3097648732323655,
4365
+ "grad_norm": 0.3617091774940491,
4366
+ "learning_rate": 6.494528758381984e-05,
4367
+ "loss": 1.0412,
4368
+ "step": 617
4369
+ },
4370
+ {
4371
+ "epoch": 0.31026692327001926,
4372
+ "grad_norm": 0.3729401230812073,
4373
+ "learning_rate": 6.484397130950254e-05,
4374
+ "loss": 1.0327,
4375
+ "step": 618
4376
+ },
4377
+ {
4378
+ "epoch": 0.310768973307673,
4379
+ "grad_norm": 0.3525683581829071,
4380
+ "learning_rate": 6.474258815056622e-05,
4381
+ "loss": 1.0164,
4382
+ "step": 619
4383
+ },
4384
+ {
4385
+ "epoch": 0.31127102334532675,
4386
+ "grad_norm": 0.3672581911087036,
4387
+ "learning_rate": 6.464113856382752e-05,
4388
+ "loss": 1.0148,
4389
+ "step": 620
4390
+ },
4391
+ {
4392
+ "epoch": 0.3117730733829805,
4393
+ "grad_norm": 0.3790574371814728,
4394
+ "learning_rate": 6.453962300640249e-05,
4395
+ "loss": 0.9997,
4396
+ "step": 621
4397
+ },
4398
+ {
4399
+ "epoch": 0.31227512342063424,
4400
+ "grad_norm": 0.36040011048316956,
4401
+ "learning_rate": 6.44380419357044e-05,
4402
+ "loss": 0.9505,
4403
+ "step": 622
4404
+ },
4405
+ {
4406
+ "epoch": 0.312777173458288,
4407
+ "grad_norm": 0.3569061756134033,
4408
+ "learning_rate": 6.43363958094417e-05,
4409
+ "loss": 0.9429,
4410
+ "step": 623
4411
+ },
4412
+ {
4413
+ "epoch": 0.3132792234959418,
4414
+ "grad_norm": 0.36146458983421326,
4415
+ "learning_rate": 6.423468508561599e-05,
4416
+ "loss": 0.9924,
4417
+ "step": 624
4418
+ },
4419
+ {
4420
+ "epoch": 0.31378127353359553,
4421
+ "grad_norm": 0.37957096099853516,
4422
+ "learning_rate": 6.413291022251989e-05,
4423
+ "loss": 0.9934,
4424
+ "step": 625
4425
+ },
4426
+ {
4427
+ "epoch": 0.3142833235712493,
4428
+ "grad_norm": 0.37144365906715393,
4429
+ "learning_rate": 6.403107167873509e-05,
4430
+ "loss": 0.9251,
4431
+ "step": 626
4432
+ },
4433
+ {
4434
+ "epoch": 0.314785373608903,
4435
+ "grad_norm": 0.3828261196613312,
4436
+ "learning_rate": 6.392916991313016e-05,
4437
+ "loss": 0.9649,
4438
+ "step": 627
4439
+ },
4440
+ {
4441
+ "epoch": 0.31528742364655676,
4442
+ "grad_norm": 0.3864898681640625,
4443
+ "learning_rate": 6.382720538485856e-05,
4444
+ "loss": 0.9834,
4445
+ "step": 628
4446
+ },
4447
+ {
4448
+ "epoch": 0.3157894736842105,
4449
+ "grad_norm": 0.3928738832473755,
4450
+ "learning_rate": 6.372517855335655e-05,
4451
+ "loss": 0.9759,
4452
+ "step": 629
4453
+ },
4454
+ {
4455
+ "epoch": 0.31629152372186425,
4456
+ "grad_norm": 0.42996037006378174,
4457
+ "learning_rate": 6.362308987834115e-05,
4458
+ "loss": 0.9628,
4459
+ "step": 630
4460
+ },
4461
+ {
4462
+ "epoch": 0.31679357375951805,
4463
+ "grad_norm": 0.3807196319103241,
4464
+ "learning_rate": 6.352093981980796e-05,
4465
+ "loss": 0.9842,
4466
+ "step": 631
4467
+ },
4468
+ {
4469
+ "epoch": 0.3172956237971718,
4470
+ "grad_norm": 0.39248624444007874,
4471
+ "learning_rate": 6.341872883802923e-05,
4472
+ "loss": 0.9539,
4473
+ "step": 632
4474
+ },
4475
+ {
4476
+ "epoch": 0.31779767383482554,
4477
+ "grad_norm": 0.4059353470802307,
4478
+ "learning_rate": 6.331645739355168e-05,
4479
+ "loss": 0.9635,
4480
+ "step": 633
4481
+ },
4482
+ {
4483
+ "epoch": 0.3182997238724793,
4484
+ "grad_norm": 0.4235178828239441,
4485
+ "learning_rate": 6.321412594719451e-05,
4486
+ "loss": 0.9473,
4487
+ "step": 634
4488
+ },
4489
+ {
4490
+ "epoch": 0.31880177391013304,
4491
+ "grad_norm": 0.45633211731910706,
4492
+ "learning_rate": 6.311173496004723e-05,
4493
+ "loss": 0.9836,
4494
+ "step": 635
4495
+ },
4496
+ {
4497
+ "epoch": 0.3193038239477868,
4498
+ "grad_norm": 0.4051073491573334,
4499
+ "learning_rate": 6.300928489346766e-05,
4500
+ "loss": 0.9482,
4501
+ "step": 636
4502
+ },
4503
+ {
4504
+ "epoch": 0.3198058739854405,
4505
+ "grad_norm": 0.4133238196372986,
4506
+ "learning_rate": 6.290677620907982e-05,
4507
+ "loss": 0.9009,
4508
+ "step": 637
4509
+ },
4510
+ {
4511
+ "epoch": 0.3203079240230943,
4512
+ "grad_norm": 0.4294078052043915,
4513
+ "learning_rate": 6.280420936877188e-05,
4514
+ "loss": 0.9389,
4515
+ "step": 638
4516
+ },
4517
+ {
4518
+ "epoch": 0.32080997406074807,
4519
+ "grad_norm": 0.4092111885547638,
4520
+ "learning_rate": 6.270158483469397e-05,
4521
+ "loss": 0.8397,
4522
+ "step": 639
4523
+ },
4524
+ {
4525
+ "epoch": 0.3213120240984018,
4526
+ "grad_norm": 0.42124441266059875,
4527
+ "learning_rate": 6.259890306925627e-05,
4528
+ "loss": 0.8405,
4529
+ "step": 640
4530
+ },
4531
+ {
4532
+ "epoch": 0.32181407413605556,
4533
+ "grad_norm": 0.4422035217285156,
4534
+ "learning_rate": 6.249616453512677e-05,
4535
+ "loss": 0.8641,
4536
+ "step": 641
4537
+ },
4538
+ {
4539
+ "epoch": 0.3223161241737093,
4540
+ "grad_norm": 0.4448348879814148,
4541
+ "learning_rate": 6.239336969522932e-05,
4542
+ "loss": 0.9077,
4543
+ "step": 642
4544
+ },
4545
+ {
4546
+ "epoch": 0.32281817421136305,
4547
+ "grad_norm": 0.4691510796546936,
4548
+ "learning_rate": 6.229051901274137e-05,
4549
+ "loss": 0.8585,
4550
+ "step": 643
4551
+ },
4552
+ {
4553
+ "epoch": 0.3233202242490168,
4554
+ "grad_norm": 0.4641557037830353,
4555
+ "learning_rate": 6.218761295109208e-05,
4556
+ "loss": 0.8527,
4557
+ "step": 644
4558
+ },
4559
+ {
4560
+ "epoch": 0.3238222742866706,
4561
+ "grad_norm": 0.5288779735565186,
4562
+ "learning_rate": 6.208465197396013e-05,
4563
+ "loss": 0.8489,
4564
+ "step": 645
4565
+ },
4566
+ {
4567
+ "epoch": 0.32432432432432434,
4568
+ "grad_norm": 0.45869073271751404,
4569
+ "learning_rate": 6.19816365452716e-05,
4570
+ "loss": 0.8505,
4571
+ "step": 646
4572
+ },
4573
+ {
4574
+ "epoch": 0.3248263743619781,
4575
+ "grad_norm": 0.49422523379325867,
4576
+ "learning_rate": 6.187856712919795e-05,
4577
+ "loss": 0.8555,
4578
+ "step": 647
4579
+ },
4580
+ {
4581
+ "epoch": 0.32532842439963183,
4582
+ "grad_norm": 0.5668922066688538,
4583
+ "learning_rate": 6.177544419015388e-05,
4584
+ "loss": 0.7629,
4585
+ "step": 648
4586
+ },
4587
+ {
4588
+ "epoch": 0.3258304744372856,
4589
+ "grad_norm": 0.5716300010681152,
4590
+ "learning_rate": 6.167226819279528e-05,
4591
+ "loss": 0.8643,
4592
+ "step": 649
4593
+ },
4594
+ {
4595
+ "epoch": 0.3263325244749393,
4596
+ "grad_norm": 0.6652288436889648,
4597
+ "learning_rate": 6.156903960201709e-05,
4598
+ "loss": 0.7433,
4599
+ "step": 650
4600
+ },
4601
+ {
4602
+ "epoch": 0.32683457451259307,
4603
+ "grad_norm": 0.6001056432723999,
4604
+ "learning_rate": 6.146575888295123e-05,
4605
+ "loss": 1.2497,
4606
+ "step": 651
4607
+ },
4608
+ {
4609
+ "epoch": 0.32733662455024687,
4610
+ "grad_norm": 0.3522529900074005,
4611
+ "learning_rate": 6.136242650096451e-05,
4612
+ "loss": 1.177,
4613
+ "step": 652
4614
+ },
4615
+ {
4616
+ "epoch": 0.3278386745879006,
4617
+ "grad_norm": 0.3846982717514038,
4618
+ "learning_rate": 6.125904292165652e-05,
4619
+ "loss": 1.1357,
4620
+ "step": 653
4621
+ },
4622
+ {
4623
+ "epoch": 0.32834072462555436,
4624
+ "grad_norm": 0.389482706785202,
4625
+ "learning_rate": 6.115560861085756e-05,
4626
+ "loss": 1.0675,
4627
+ "step": 654
4628
+ },
4629
+ {
4630
+ "epoch": 0.3288427746632081,
4631
+ "grad_norm": 0.41399508714675903,
4632
+ "learning_rate": 6.105212403462651e-05,
4633
+ "loss": 1.1065,
4634
+ "step": 655
4635
+ },
4636
+ {
4637
+ "epoch": 0.32934482470086185,
4638
+ "grad_norm": 0.5792128443717957,
4639
+ "learning_rate": 6.0948589659248654e-05,
4640
+ "loss": 1.1188,
4641
+ "step": 656
4642
+ },
4643
+ {
4644
+ "epoch": 0.3298468747385156,
4645
+ "grad_norm": 0.3753111958503723,
4646
+ "learning_rate": 6.084500595123383e-05,
4647
+ "loss": 1.1127,
4648
+ "step": 657
4649
+ },
4650
+ {
4651
+ "epoch": 0.33034892477616934,
4652
+ "grad_norm": 0.3663425147533417,
4653
+ "learning_rate": 6.0741373377314005e-05,
4654
+ "loss": 1.019,
4655
+ "step": 658
4656
+ },
4657
+ {
4658
+ "epoch": 0.3308509748138231,
4659
+ "grad_norm": 0.39105096459388733,
4660
+ "learning_rate": 6.0637692404441416e-05,
4661
+ "loss": 1.0186,
4662
+ "step": 659
4663
+ },
4664
+ {
4665
+ "epoch": 0.3313530248514769,
4666
+ "grad_norm": 0.38673144578933716,
4667
+ "learning_rate": 6.0533963499786314e-05,
4668
+ "loss": 1.0256,
4669
+ "step": 660
4670
+ },
4671
+ {
4672
+ "epoch": 0.33185507488913063,
4673
+ "grad_norm": 0.3633407950401306,
4674
+ "learning_rate": 6.0430187130735016e-05,
4675
+ "loss": 1.0332,
4676
+ "step": 661
4677
+ },
4678
+ {
4679
+ "epoch": 0.3323571249267844,
4680
+ "grad_norm": 0.35200172662734985,
4681
+ "learning_rate": 6.032636376488763e-05,
4682
+ "loss": 0.9356,
4683
+ "step": 662
4684
+ },
4685
+ {
4686
+ "epoch": 0.3328591749644381,
4687
+ "grad_norm": 0.3665078282356262,
4688
+ "learning_rate": 6.0222493870056044e-05,
4689
+ "loss": 1.0154,
4690
+ "step": 663
4691
+ },
4692
+ {
4693
+ "epoch": 0.33336122500209187,
4694
+ "grad_norm": 0.3591248095035553,
4695
+ "learning_rate": 6.0118577914261784e-05,
4696
+ "loss": 0.9798,
4697
+ "step": 664
4698
+ },
4699
+ {
4700
+ "epoch": 0.3338632750397456,
4701
+ "grad_norm": 0.361217200756073,
4702
+ "learning_rate": 6.001461636573397e-05,
4703
+ "loss": 0.9813,
4704
+ "step": 665
4705
+ },
4706
+ {
4707
+ "epoch": 0.33436532507739936,
4708
+ "grad_norm": 0.37569659948349,
4709
+ "learning_rate": 5.99106096929071e-05,
4710
+ "loss": 1.011,
4711
+ "step": 666
4712
+ },
4713
+ {
4714
+ "epoch": 0.33486737511505316,
4715
+ "grad_norm": 0.3692183494567871,
4716
+ "learning_rate": 5.980655836441902e-05,
4717
+ "loss": 1.0294,
4718
+ "step": 667
4719
+ },
4720
+ {
4721
+ "epoch": 0.3353694251527069,
4722
+ "grad_norm": 0.374726802110672,
4723
+ "learning_rate": 5.970246284910876e-05,
4724
+ "loss": 0.9654,
4725
+ "step": 668
4726
+ },
4727
+ {
4728
+ "epoch": 0.33587147519036065,
4729
+ "grad_norm": 0.3687571585178375,
4730
+ "learning_rate": 5.959832361601453e-05,
4731
+ "loss": 1.0423,
4732
+ "step": 669
4733
+ },
4734
+ {
4735
+ "epoch": 0.3363735252280144,
4736
+ "grad_norm": 0.36362433433532715,
4737
+ "learning_rate": 5.949414113437142e-05,
4738
+ "loss": 0.8874,
4739
+ "step": 670
4740
+ },
4741
+ {
4742
+ "epoch": 0.33687557526566814,
4743
+ "grad_norm": 0.34844672679901123,
4744
+ "learning_rate": 5.938991587360946e-05,
4745
+ "loss": 0.8979,
4746
+ "step": 671
4747
+ },
4748
+ {
4749
+ "epoch": 0.3373776253033219,
4750
+ "grad_norm": 0.3646034598350525,
4751
+ "learning_rate": 5.9285648303351404e-05,
4752
+ "loss": 0.9435,
4753
+ "step": 672
4754
+ },
4755
+ {
4756
+ "epoch": 0.3378796753409756,
4757
+ "grad_norm": 0.37094947695732117,
4758
+ "learning_rate": 5.9181338893410663e-05,
4759
+ "loss": 0.9679,
4760
+ "step": 673
4761
+ },
4762
+ {
4763
+ "epoch": 0.3383817253786294,
4764
+ "grad_norm": 0.385873943567276,
4765
+ "learning_rate": 5.907698811378919e-05,
4766
+ "loss": 0.9898,
4767
+ "step": 674
4768
+ },
4769
+ {
4770
+ "epoch": 0.3388837754162832,
4771
+ "grad_norm": 0.38623571395874023,
4772
+ "learning_rate": 5.897259643467527e-05,
4773
+ "loss": 0.987,
4774
+ "step": 675
4775
+ },
4776
+ {
4777
+ "epoch": 0.3393858254539369,
4778
+ "grad_norm": 0.3703857362270355,
4779
+ "learning_rate": 5.8868164326441546e-05,
4780
+ "loss": 0.919,
4781
+ "step": 676
4782
+ },
4783
+ {
4784
+ "epoch": 0.33988787549159066,
4785
+ "grad_norm": 0.3874402344226837,
4786
+ "learning_rate": 5.876369225964283e-05,
4787
+ "loss": 0.959,
4788
+ "step": 677
4789
+ },
4790
+ {
4791
+ "epoch": 0.3403899255292444,
4792
+ "grad_norm": 0.37169700860977173,
4793
+ "learning_rate": 5.8659180705013936e-05,
4794
+ "loss": 0.9883,
4795
+ "step": 678
4796
+ },
4797
+ {
4798
+ "epoch": 0.34089197556689815,
4799
+ "grad_norm": 0.4187929332256317,
4800
+ "learning_rate": 5.8554630133467624e-05,
4801
+ "loss": 0.9527,
4802
+ "step": 679
4803
+ },
4804
+ {
4805
+ "epoch": 0.3413940256045519,
4806
+ "grad_norm": 0.39550694823265076,
4807
+ "learning_rate": 5.8450041016092464e-05,
4808
+ "loss": 0.9152,
4809
+ "step": 680
4810
+ },
4811
+ {
4812
+ "epoch": 0.3418960756422057,
4813
+ "grad_norm": 0.40294429659843445,
4814
+ "learning_rate": 5.83454138241507e-05,
4815
+ "loss": 0.95,
4816
+ "step": 681
4817
+ },
4818
+ {
4819
+ "epoch": 0.34239812567985944,
4820
+ "grad_norm": 0.38999685645103455,
4821
+ "learning_rate": 5.8240749029076134e-05,
4822
+ "loss": 0.9475,
4823
+ "step": 682
4824
+ },
4825
+ {
4826
+ "epoch": 0.3429001757175132,
4827
+ "grad_norm": 0.40788596868515015,
4828
+ "learning_rate": 5.8136047102472e-05,
4829
+ "loss": 1.01,
4830
+ "step": 683
4831
+ },
4832
+ {
4833
+ "epoch": 0.34340222575516693,
4834
+ "grad_norm": 0.4204280972480774,
4835
+ "learning_rate": 5.803130851610886e-05,
4836
+ "loss": 0.934,
4837
+ "step": 684
4838
+ },
4839
+ {
4840
+ "epoch": 0.3439042757928207,
4841
+ "grad_norm": 0.4102809429168701,
4842
+ "learning_rate": 5.792653374192245e-05,
4843
+ "loss": 0.9398,
4844
+ "step": 685
4845
+ },
4846
+ {
4847
+ "epoch": 0.3444063258304744,
4848
+ "grad_norm": 0.4025559723377228,
4849
+ "learning_rate": 5.782172325201155e-05,
4850
+ "loss": 0.9245,
4851
+ "step": 686
4852
+ },
4853
+ {
4854
+ "epoch": 0.34490837586812817,
4855
+ "grad_norm": 0.4101907014846802,
4856
+ "learning_rate": 5.771687751863587e-05,
4857
+ "loss": 0.9279,
4858
+ "step": 687
4859
+ },
4860
+ {
4861
+ "epoch": 0.34541042590578197,
4862
+ "grad_norm": 0.43221110105514526,
4863
+ "learning_rate": 5.761199701421391e-05,
4864
+ "loss": 0.8831,
4865
+ "step": 688
4866
+ },
4867
+ {
4868
+ "epoch": 0.3459124759434357,
4869
+ "grad_norm": 0.42259782552719116,
4870
+ "learning_rate": 5.750708221132092e-05,
4871
+ "loss": 0.8903,
4872
+ "step": 689
4873
+ },
4874
+ {
4875
+ "epoch": 0.34641452598108946,
4876
+ "grad_norm": 0.4195202887058258,
4877
+ "learning_rate": 5.7402133582686576e-05,
4878
+ "loss": 0.8291,
4879
+ "step": 690
4880
+ },
4881
+ {
4882
+ "epoch": 0.3469165760187432,
4883
+ "grad_norm": 0.4531534016132355,
4884
+ "learning_rate": 5.7297151601193056e-05,
4885
+ "loss": 0.8893,
4886
+ "step": 691
4887
+ },
4888
+ {
4889
+ "epoch": 0.34741862605639695,
4890
+ "grad_norm": 0.46428826451301575,
4891
+ "learning_rate": 5.719213673987277e-05,
4892
+ "loss": 0.9049,
4893
+ "step": 692
4894
+ },
4895
+ {
4896
+ "epoch": 0.3479206760940507,
4897
+ "grad_norm": 0.4338727295398712,
4898
+ "learning_rate": 5.708708947190634e-05,
4899
+ "loss": 0.8142,
4900
+ "step": 693
4901
+ },
4902
+ {
4903
+ "epoch": 0.34842272613170444,
4904
+ "grad_norm": 0.44543692469596863,
4905
+ "learning_rate": 5.698201027062034e-05,
4906
+ "loss": 0.8463,
4907
+ "step": 694
4908
+ },
4909
+ {
4910
+ "epoch": 0.3489247761693582,
4911
+ "grad_norm": 0.4769425094127655,
4912
+ "learning_rate": 5.6876899609485256e-05,
4913
+ "loss": 0.8931,
4914
+ "step": 695
4915
+ },
4916
+ {
4917
+ "epoch": 0.349426826207012,
4918
+ "grad_norm": 0.49232223629951477,
4919
+ "learning_rate": 5.6771757962113323e-05,
4920
+ "loss": 0.8189,
4921
+ "step": 696
4922
+ },
4923
+ {
4924
+ "epoch": 0.34992887624466573,
4925
+ "grad_norm": 0.49148690700531006,
4926
+ "learning_rate": 5.666658580225643e-05,
4927
+ "loss": 0.8153,
4928
+ "step": 697
4929
+ },
4930
+ {
4931
+ "epoch": 0.3504309262823195,
4932
+ "grad_norm": 0.5055503845214844,
4933
+ "learning_rate": 5.656138360380391e-05,
4934
+ "loss": 0.8018,
4935
+ "step": 698
4936
+ },
4937
+ {
4938
+ "epoch": 0.3509329763199732,
4939
+ "grad_norm": 0.5481170415878296,
4940
+ "learning_rate": 5.645615184078044e-05,
4941
+ "loss": 0.8587,
4942
+ "step": 699
4943
+ },
4944
+ {
4945
+ "epoch": 0.35143502635762697,
4946
+ "grad_norm": 0.6615381240844727,
4947
+ "learning_rate": 5.6350890987343944e-05,
4948
+ "loss": 0.777,
4949
+ "step": 700
4950
+ },
4951
+ {
4952
+ "epoch": 0.3519370763952807,
4953
+ "grad_norm": 0.434299111366272,
4954
+ "learning_rate": 5.6245601517783406e-05,
4955
+ "loss": 1.2088,
4956
+ "step": 701
4957
+ },
4958
+ {
4959
+ "epoch": 0.35243912643293446,
4960
+ "grad_norm": 0.39533907175064087,
4961
+ "learning_rate": 5.614028390651675e-05,
4962
+ "loss": 1.1814,
4963
+ "step": 702
4964
+ },
4965
+ {
4966
+ "epoch": 0.35294117647058826,
4967
+ "grad_norm": 0.3828687369823456,
4968
+ "learning_rate": 5.6034938628088705e-05,
4969
+ "loss": 1.1873,
4970
+ "step": 703
4971
+ },
4972
+ {
4973
+ "epoch": 0.353443226508242,
4974
+ "grad_norm": 0.3660382628440857,
4975
+ "learning_rate": 5.5929566157168665e-05,
4976
+ "loss": 1.0862,
4977
+ "step": 704
4978
+ },
4979
+ {
4980
+ "epoch": 0.35394527654589575,
4981
+ "grad_norm": 0.39876964688301086,
4982
+ "learning_rate": 5.582416696854853e-05,
4983
+ "loss": 1.0083,
4984
+ "step": 705
4985
+ },
4986
+ {
4987
+ "epoch": 0.3544473265835495,
4988
+ "grad_norm": 0.409247487783432,
4989
+ "learning_rate": 5.571874153714063e-05,
4990
+ "loss": 1.0714,
4991
+ "step": 706
4992
+ },
4993
+ {
4994
+ "epoch": 0.35494937662120324,
4995
+ "grad_norm": 0.3872778117656708,
4996
+ "learning_rate": 5.561329033797547e-05,
4997
+ "loss": 1.085,
4998
+ "step": 707
4999
+ },
5000
+ {
5001
+ "epoch": 0.355451426658857,
5002
+ "grad_norm": 0.38185930252075195,
5003
+ "learning_rate": 5.550781384619973e-05,
5004
+ "loss": 1.0762,
5005
+ "step": 708
5006
+ },
5007
+ {
5008
+ "epoch": 0.35595347669651073,
5009
+ "grad_norm": 0.3866881728172302,
5010
+ "learning_rate": 5.540231253707403e-05,
5011
+ "loss": 1.0326,
5012
+ "step": 709
5013
+ },
5014
+ {
5015
+ "epoch": 0.35645552673416453,
5016
+ "grad_norm": 0.37910160422325134,
5017
+ "learning_rate": 5.5296786885970805e-05,
5018
+ "loss": 1.0769,
5019
+ "step": 710
5020
+ },
5021
+ {
5022
+ "epoch": 0.3569575767718183,
5023
+ "grad_norm": 0.3608991205692291,
5024
+ "learning_rate": 5.519123736837217e-05,
5025
+ "loss": 1.0523,
5026
+ "step": 711
5027
+ },
5028
+ {
5029
+ "epoch": 0.357459626809472,
5030
+ "grad_norm": 0.36697694659233093,
5031
+ "learning_rate": 5.50856644598678e-05,
5032
+ "loss": 0.9778,
5033
+ "step": 712
5034
+ },
5035
+ {
5036
+ "epoch": 0.35796167684712576,
5037
+ "grad_norm": 0.4545275568962097,
5038
+ "learning_rate": 5.498006863615275e-05,
5039
+ "loss": 1.0207,
5040
+ "step": 713
5041
+ },
5042
+ {
5043
+ "epoch": 0.3584637268847795,
5044
+ "grad_norm": 0.3483712375164032,
5045
+ "learning_rate": 5.487445037302531e-05,
5046
+ "loss": 1.0002,
5047
+ "step": 714
5048
+ },
5049
+ {
5050
+ "epoch": 0.35896577692243326,
5051
+ "grad_norm": 0.3665158152580261,
5052
+ "learning_rate": 5.476881014638491e-05,
5053
+ "loss": 1.0274,
5054
+ "step": 715
5055
+ },
5056
+ {
5057
+ "epoch": 0.359467826960087,
5058
+ "grad_norm": 0.35564157366752625,
5059
+ "learning_rate": 5.466314843222993e-05,
5060
+ "loss": 0.9884,
5061
+ "step": 716
5062
+ },
5063
+ {
5064
+ "epoch": 0.3599698769977408,
5065
+ "grad_norm": 0.3559761345386505,
5066
+ "learning_rate": 5.4557465706655564e-05,
5067
+ "loss": 1.0143,
5068
+ "step": 717
5069
+ },
5070
+ {
5071
+ "epoch": 0.36047192703539455,
5072
+ "grad_norm": 0.38508090376853943,
5073
+ "learning_rate": 5.4451762445851705e-05,
5074
+ "loss": 1.0679,
5075
+ "step": 718
5076
+ },
5077
+ {
5078
+ "epoch": 0.3609739770730483,
5079
+ "grad_norm": 0.3513292670249939,
5080
+ "learning_rate": 5.4346039126100733e-05,
5081
+ "loss": 0.948,
5082
+ "step": 719
5083
+ },
5084
+ {
5085
+ "epoch": 0.36147602711070204,
5086
+ "grad_norm": 0.36502474546432495,
5087
+ "learning_rate": 5.4240296223775465e-05,
5088
+ "loss": 1.0246,
5089
+ "step": 720
5090
+ },
5091
+ {
5092
+ "epoch": 0.3619780771483558,
5093
+ "grad_norm": 0.3846004605293274,
5094
+ "learning_rate": 5.41345342153369e-05,
5095
+ "loss": 1.0332,
5096
+ "step": 721
5097
+ },
5098
+ {
5099
+ "epoch": 0.3624801271860095,
5100
+ "grad_norm": 0.35061997175216675,
5101
+ "learning_rate": 5.4028753577332146e-05,
5102
+ "loss": 0.9286,
5103
+ "step": 722
5104
+ },
5105
+ {
5106
+ "epoch": 0.36298217722366327,
5107
+ "grad_norm": 0.37235984206199646,
5108
+ "learning_rate": 5.392295478639225e-05,
5109
+ "loss": 1.0385,
5110
+ "step": 723
5111
+ },
5112
+ {
5113
+ "epoch": 0.36348422726131707,
5114
+ "grad_norm": 0.3770149350166321,
5115
+ "learning_rate": 5.3817138319230076e-05,
5116
+ "loss": 0.9865,
5117
+ "step": 724
5118
+ },
5119
+ {
5120
+ "epoch": 0.3639862772989708,
5121
+ "grad_norm": 0.3904590606689453,
5122
+ "learning_rate": 5.3711304652638126e-05,
5123
+ "loss": 0.934,
5124
+ "step": 725
5125
+ },
5126
+ {
5127
+ "epoch": 0.36448832733662456,
5128
+ "grad_norm": 0.3823120892047882,
5129
+ "learning_rate": 5.360545426348638e-05,
5130
+ "loss": 0.9394,
5131
+ "step": 726
5132
+ },
5133
+ {
5134
+ "epoch": 0.3649903773742783,
5135
+ "grad_norm": 0.36231666803359985,
5136
+ "learning_rate": 5.349958762872016e-05,
5137
+ "loss": 0.9282,
5138
+ "step": 727
5139
+ },
5140
+ {
5141
+ "epoch": 0.36549242741193205,
5142
+ "grad_norm": 0.3757944405078888,
5143
+ "learning_rate": 5.3393705225358046e-05,
5144
+ "loss": 0.8884,
5145
+ "step": 728
5146
+ },
5147
+ {
5148
+ "epoch": 0.3659944774495858,
5149
+ "grad_norm": 0.4007607102394104,
5150
+ "learning_rate": 5.32878075304896e-05,
5151
+ "loss": 0.9739,
5152
+ "step": 729
5153
+ },
5154
+ {
5155
+ "epoch": 0.36649652748723954,
5156
+ "grad_norm": 0.40476924180984497,
5157
+ "learning_rate": 5.318189502127332e-05,
5158
+ "loss": 0.9458,
5159
+ "step": 730
5160
+ },
5161
+ {
5162
+ "epoch": 0.3669985775248933,
5163
+ "grad_norm": 0.39884302020072937,
5164
+ "learning_rate": 5.307596817493445e-05,
5165
+ "loss": 0.8989,
5166
+ "step": 731
5167
+ },
5168
+ {
5169
+ "epoch": 0.3675006275625471,
5170
+ "grad_norm": 0.42604318261146545,
5171
+ "learning_rate": 5.297002746876284e-05,
5172
+ "loss": 0.9337,
5173
+ "step": 732
5174
+ },
5175
+ {
5176
+ "epoch": 0.36800267760020083,
5177
+ "grad_norm": 0.41235285997390747,
5178
+ "learning_rate": 5.286407338011079e-05,
5179
+ "loss": 0.9191,
5180
+ "step": 733
5181
+ },
5182
+ {
5183
+ "epoch": 0.3685047276378546,
5184
+ "grad_norm": 0.40768033266067505,
5185
+ "learning_rate": 5.275810638639088e-05,
5186
+ "loss": 0.957,
5187
+ "step": 734
5188
+ },
5189
+ {
5190
+ "epoch": 0.3690067776755083,
5191
+ "grad_norm": 0.42073965072631836,
5192
+ "learning_rate": 5.265212696507387e-05,
5193
+ "loss": 0.9503,
5194
+ "step": 735
5195
+ },
5196
+ {
5197
+ "epoch": 0.36950882771316207,
5198
+ "grad_norm": 0.40175575017929077,
5199
+ "learning_rate": 5.254613559368649e-05,
5200
+ "loss": 0.9277,
5201
+ "step": 736
5202
+ },
5203
+ {
5204
+ "epoch": 0.3700108777508158,
5205
+ "grad_norm": 0.39959418773651123,
5206
+ "learning_rate": 5.2440132749809313e-05,
5207
+ "loss": 0.9021,
5208
+ "step": 737
5209
+ },
5210
+ {
5211
+ "epoch": 0.37051292778846956,
5212
+ "grad_norm": 0.45893776416778564,
5213
+ "learning_rate": 5.2334118911074635e-05,
5214
+ "loss": 0.9413,
5215
+ "step": 738
5216
+ },
5217
+ {
5218
+ "epoch": 0.37101497782612336,
5219
+ "grad_norm": 0.4203508794307709,
5220
+ "learning_rate": 5.2228094555164265e-05,
5221
+ "loss": 0.9131,
5222
+ "step": 739
5223
+ },
5224
+ {
5225
+ "epoch": 0.3715170278637771,
5226
+ "grad_norm": 0.4097796082496643,
5227
+ "learning_rate": 5.212206015980742e-05,
5228
+ "loss": 0.881,
5229
+ "step": 740
5230
+ },
5231
+ {
5232
+ "epoch": 0.37201907790143085,
5233
+ "grad_norm": 0.44615375995635986,
5234
+ "learning_rate": 5.201601620277854e-05,
5235
+ "loss": 0.8147,
5236
+ "step": 741
5237
+ },
5238
+ {
5239
+ "epoch": 0.3725211279390846,
5240
+ "grad_norm": 0.4491327702999115,
5241
+ "learning_rate": 5.190996316189515e-05,
5242
+ "loss": 0.8368,
5243
+ "step": 742
5244
+ },
5245
+ {
5246
+ "epoch": 0.37302317797673834,
5247
+ "grad_norm": 0.4489690065383911,
5248
+ "learning_rate": 5.180390151501569e-05,
5249
+ "loss": 0.9062,
5250
+ "step": 743
5251
+ },
5252
+ {
5253
+ "epoch": 0.3735252280143921,
5254
+ "grad_norm": 0.4554278552532196,
5255
+ "learning_rate": 5.1697831740037436e-05,
5256
+ "loss": 0.841,
5257
+ "step": 744
5258
+ },
5259
+ {
5260
+ "epoch": 0.37402727805204583,
5261
+ "grad_norm": 0.4591432213783264,
5262
+ "learning_rate": 5.159175431489424e-05,
5263
+ "loss": 0.8241,
5264
+ "step": 745
5265
+ },
5266
+ {
5267
+ "epoch": 0.37452932808969963,
5268
+ "grad_norm": 0.4552235007286072,
5269
+ "learning_rate": 5.1485669717554396e-05,
5270
+ "loss": 0.7784,
5271
+ "step": 746
5272
+ },
5273
+ {
5274
+ "epoch": 0.3750313781273534,
5275
+ "grad_norm": 0.4900113046169281,
5276
+ "learning_rate": 5.137957842601856e-05,
5277
+ "loss": 0.7905,
5278
+ "step": 747
5279
+ },
5280
+ {
5281
+ "epoch": 0.3755334281650071,
5282
+ "grad_norm": 0.5452777743339539,
5283
+ "learning_rate": 5.1273480918317554e-05,
5284
+ "loss": 0.8248,
5285
+ "step": 748
5286
+ },
5287
+ {
5288
+ "epoch": 0.37603547820266087,
5289
+ "grad_norm": 0.5230666399002075,
5290
+ "learning_rate": 5.116737767251021e-05,
5291
+ "loss": 0.781,
5292
+ "step": 749
5293
+ },
5294
+ {
5295
+ "epoch": 0.3765375282403146,
5296
+ "grad_norm": 0.632352352142334,
5297
+ "learning_rate": 5.1061269166681183e-05,
5298
+ "loss": 0.7272,
5299
+ "step": 750
5300
+ },
5301
+ {
5302
+ "epoch": 0.3765375282403146,
5303
+ "eval_loss": 0.9616905450820923,
5304
+ "eval_runtime": 709.9548,
5305
+ "eval_samples_per_second": 21.265,
5306
+ "eval_steps_per_second": 2.659,
5307
+ "step": 750
  }
  ],
  "logging_steps": 1,
@@ -4275,7 +5333,7 @@
  "attributes": {}
  }
  },
- "total_flos": 4.1668916328907407e+18,
+ "total_flos": 5.207200309633352e+18,
  "train_batch_size": 12,
  "trial_name": null,
  "trial_params": null