romainnn commited on
Commit
fc428c1
·
verified ·
1 Parent(s): 0e9a22c

Training in progress, step 700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc7f9f12a7d975eceff199ac96f1fb4abb58016ab42da13fe25b1845a1e0b3ce
3
  size 289512208
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b9031cffc344e8840f4c7143987fa4e58be60af4e5110d63d50416a3f8b59f3
3
  size 289512208
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37a24bd2239b20338e528442d83d2139315c8017ad79ac6456a27ebc2a7c4982
3
  size 147781972
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3586c1b40b78d2d911170eb1a15bda1bb7e14d32d622befc2e331d34b42a7aff
3
  size 147781972
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28ace9fc649252ea1299cd2d9b1953184b717d1b1778bd2d51cf81f8fdd857fb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b885f396904d7214b2eeb837fb3989cd5db4deae210b67eca24ef3c766dfa24
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14de197ce4fca667a77214b11d375124cfec5ed9c075fb60180e734827aaa864
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aa4ab3cdb1a9e7e00fd89c904cf6ae8c19a72f37f60ac96d0d021814a6f0bd4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.2089511156082153,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-600",
4
- "epoch": 0.03813912836200389,
5
  "eval_steps": 100,
6
- "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4263,6 +4263,714 @@
4263
  "eval_samples_per_second": 4.035,
4264
  "eval_steps_per_second": 1.009,
4265
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4266
  }
4267
  ],
4268
  "logging_steps": 1,
@@ -4291,7 +4999,7 @@
4291
  "attributes": {}
4292
  }
4293
  },
4294
- "total_flos": 3.117735147995136e+18,
4295
  "train_batch_size": 4,
4296
  "trial_name": null,
4297
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.2046868801116943,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-700",
4
+ "epoch": 0.04449564975567121,
5
  "eval_steps": 100,
6
+ "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4263
  "eval_samples_per_second": 4.035,
4264
  "eval_steps_per_second": 1.009,
4265
  "step": 600
4266
+ },
4267
+ {
4268
+ "epoch": 0.03820269357594057,
4269
+ "grad_norm": 0.2633255422115326,
4270
+ "learning_rate": 3.820213483066737e-05,
4271
+ "loss": 1.1605,
4272
+ "step": 601
4273
+ },
4274
+ {
4275
+ "epoch": 0.03826625878987724,
4276
+ "grad_norm": 0.24185825884342194,
4277
+ "learning_rate": 3.7904999416234864e-05,
4278
+ "loss": 1.2412,
4279
+ "step": 602
4280
+ },
4281
+ {
4282
+ "epoch": 0.03832982400381391,
4283
+ "grad_norm": 0.26332393288612366,
4284
+ "learning_rate": 3.7608753611846446e-05,
4285
+ "loss": 1.222,
4286
+ "step": 603
4287
+ },
4288
+ {
4289
+ "epoch": 0.03839338921775059,
4290
+ "grad_norm": 0.258789986371994,
4291
+ "learning_rate": 3.731340166169635e-05,
4292
+ "loss": 1.105,
4293
+ "step": 604
4294
+ },
4295
+ {
4296
+ "epoch": 0.03845695443168726,
4297
+ "grad_norm": 0.2542060315608978,
4298
+ "learning_rate": 3.701894779717286e-05,
4299
+ "loss": 1.149,
4300
+ "step": 605
4301
+ },
4302
+ {
4303
+ "epoch": 0.03852051964562393,
4304
+ "grad_norm": 0.2608526647090912,
4305
+ "learning_rate": 3.6725396236797935e-05,
4306
+ "loss": 1.1996,
4307
+ "step": 606
4308
+ },
4309
+ {
4310
+ "epoch": 0.03858408485956061,
4311
+ "grad_norm": 0.24727903306484222,
4312
+ "learning_rate": 3.64327511861663e-05,
4313
+ "loss": 1.1608,
4314
+ "step": 607
4315
+ },
4316
+ {
4317
+ "epoch": 0.038647650073497276,
4318
+ "grad_norm": 0.2504411041736603,
4319
+ "learning_rate": 3.614101683788575e-05,
4320
+ "loss": 1.1245,
4321
+ "step": 608
4322
+ },
4323
+ {
4324
+ "epoch": 0.03871121528743395,
4325
+ "grad_norm": 0.26074525713920593,
4326
+ "learning_rate": 3.585019737151669e-05,
4327
+ "loss": 1.1894,
4328
+ "step": 609
4329
+ },
4330
+ {
4331
+ "epoch": 0.03877478050137063,
4332
+ "grad_norm": 0.2569214105606079,
4333
+ "learning_rate": 3.5560296953512295e-05,
4334
+ "loss": 1.2079,
4335
+ "step": 610
4336
+ },
4337
+ {
4338
+ "epoch": 0.038838345715307296,
4339
+ "grad_norm": 0.2611664831638336,
4340
+ "learning_rate": 3.52713197371591e-05,
4341
+ "loss": 1.1224,
4342
+ "step": 611
4343
+ },
4344
+ {
4345
+ "epoch": 0.03890191092924397,
4346
+ "grad_norm": 0.25584879517555237,
4347
+ "learning_rate": 3.498326986251717e-05,
4348
+ "loss": 1.3047,
4349
+ "step": 612
4350
+ },
4351
+ {
4352
+ "epoch": 0.03896547614318065,
4353
+ "grad_norm": 0.24464906752109528,
4354
+ "learning_rate": 3.4696151456360956e-05,
4355
+ "loss": 1.0912,
4356
+ "step": 613
4357
+ },
4358
+ {
4359
+ "epoch": 0.039029041357117315,
4360
+ "grad_norm": 0.25548434257507324,
4361
+ "learning_rate": 3.4409968632120126e-05,
4362
+ "loss": 1.2166,
4363
+ "step": 614
4364
+ },
4365
+ {
4366
+ "epoch": 0.03909260657105399,
4367
+ "grad_norm": 0.25605612993240356,
4368
+ "learning_rate": 3.4124725489820645e-05,
4369
+ "loss": 1.2629,
4370
+ "step": 615
4371
+ },
4372
+ {
4373
+ "epoch": 0.039156171784990666,
4374
+ "grad_norm": 0.24373793601989746,
4375
+ "learning_rate": 3.3840426116026044e-05,
4376
+ "loss": 1.1917,
4377
+ "step": 616
4378
+ },
4379
+ {
4380
+ "epoch": 0.039219736998927335,
4381
+ "grad_norm": 0.24588941037654877,
4382
+ "learning_rate": 3.3557074583778814e-05,
4383
+ "loss": 1.286,
4384
+ "step": 617
4385
+ },
4386
+ {
4387
+ "epoch": 0.03928330221286401,
4388
+ "grad_norm": 0.25556549429893494,
4389
+ "learning_rate": 3.327467495254225e-05,
4390
+ "loss": 1.2295,
4391
+ "step": 618
4392
+ },
4393
+ {
4394
+ "epoch": 0.039346867426800686,
4395
+ "grad_norm": 0.2570589780807495,
4396
+ "learning_rate": 3.299323126814191e-05,
4397
+ "loss": 1.2417,
4398
+ "step": 619
4399
+ },
4400
+ {
4401
+ "epoch": 0.039410432640737354,
4402
+ "grad_norm": 0.24832259118556976,
4403
+ "learning_rate": 3.2712747562708115e-05,
4404
+ "loss": 1.2996,
4405
+ "step": 620
4406
+ },
4407
+ {
4408
+ "epoch": 0.03947399785467403,
4409
+ "grad_norm": 0.2418624311685562,
4410
+ "learning_rate": 3.243322785461781e-05,
4411
+ "loss": 1.2418,
4412
+ "step": 621
4413
+ },
4414
+ {
4415
+ "epoch": 0.039537563068610705,
4416
+ "grad_norm": 0.2648262083530426,
4417
+ "learning_rate": 3.215467614843719e-05,
4418
+ "loss": 1.2913,
4419
+ "step": 622
4420
+ },
4421
+ {
4422
+ "epoch": 0.039601128282547374,
4423
+ "grad_norm": 0.2682283818721771,
4424
+ "learning_rate": 3.187709643486427e-05,
4425
+ "loss": 1.2148,
4426
+ "step": 623
4427
+ },
4428
+ {
4429
+ "epoch": 0.03966469349648405,
4430
+ "grad_norm": 0.26762086153030396,
4431
+ "learning_rate": 3.160049269067174e-05,
4432
+ "loss": 1.2949,
4433
+ "step": 624
4434
+ },
4435
+ {
4436
+ "epoch": 0.039728258710420725,
4437
+ "grad_norm": 0.25577932596206665,
4438
+ "learning_rate": 3.132486887864992e-05,
4439
+ "loss": 1.12,
4440
+ "step": 625
4441
+ },
4442
+ {
4443
+ "epoch": 0.03979182392435739,
4444
+ "grad_norm": 0.2690037190914154,
4445
+ "learning_rate": 3.105022894755003e-05,
4446
+ "loss": 1.1813,
4447
+ "step": 626
4448
+ },
4449
+ {
4450
+ "epoch": 0.03985538913829407,
4451
+ "grad_norm": 0.25239890813827515,
4452
+ "learning_rate": 3.077657683202779e-05,
4453
+ "loss": 1.2678,
4454
+ "step": 627
4455
+ },
4456
+ {
4457
+ "epoch": 0.039918954352230744,
4458
+ "grad_norm": 0.2665114104747772,
4459
+ "learning_rate": 3.0503916452586612e-05,
4460
+ "loss": 1.0682,
4461
+ "step": 628
4462
+ },
4463
+ {
4464
+ "epoch": 0.03998251956616741,
4465
+ "grad_norm": 0.2506917715072632,
4466
+ "learning_rate": 3.0232251715521932e-05,
4467
+ "loss": 1.2247,
4468
+ "step": 629
4469
+ },
4470
+ {
4471
+ "epoch": 0.04004608478010409,
4472
+ "grad_norm": 0.24417519569396973,
4473
+ "learning_rate": 2.9961586512864947e-05,
4474
+ "loss": 1.2097,
4475
+ "step": 630
4476
+ },
4477
+ {
4478
+ "epoch": 0.040109649994040764,
4479
+ "grad_norm": 0.2546679973602295,
4480
+ "learning_rate": 2.9691924722326826e-05,
4481
+ "loss": 1.1986,
4482
+ "step": 631
4483
+ },
4484
+ {
4485
+ "epoch": 0.04017321520797743,
4486
+ "grad_norm": 0.25688743591308594,
4487
+ "learning_rate": 2.9423270207243437e-05,
4488
+ "loss": 1.2543,
4489
+ "step": 632
4490
+ },
4491
+ {
4492
+ "epoch": 0.04023678042191411,
4493
+ "grad_norm": 0.23989447951316833,
4494
+ "learning_rate": 2.9155626816519677e-05,
4495
+ "loss": 1.0773,
4496
+ "step": 633
4497
+ },
4498
+ {
4499
+ "epoch": 0.04030034563585078,
4500
+ "grad_norm": 0.25679922103881836,
4501
+ "learning_rate": 2.888899838457455e-05,
4502
+ "loss": 1.2843,
4503
+ "step": 634
4504
+ },
4505
+ {
4506
+ "epoch": 0.04036391084978745,
4507
+ "grad_norm": 0.2580190896987915,
4508
+ "learning_rate": 2.8623388731286093e-05,
4509
+ "loss": 1.1911,
4510
+ "step": 635
4511
+ },
4512
+ {
4513
+ "epoch": 0.04042747606372413,
4514
+ "grad_norm": 0.24526208639144897,
4515
+ "learning_rate": 2.835880166193683e-05,
4516
+ "loss": 1.2574,
4517
+ "step": 636
4518
+ },
4519
+ {
4520
+ "epoch": 0.0404910412776608,
4521
+ "grad_norm": 0.25860583782196045,
4522
+ "learning_rate": 2.8095240967158954e-05,
4523
+ "loss": 1.2943,
4524
+ "step": 637
4525
+ },
4526
+ {
4527
+ "epoch": 0.04055460649159747,
4528
+ "grad_norm": 0.26202407479286194,
4529
+ "learning_rate": 2.7832710422880328e-05,
4530
+ "loss": 1.1769,
4531
+ "step": 638
4532
+ },
4533
+ {
4534
+ "epoch": 0.040618171705534147,
4535
+ "grad_norm": 0.2458542138338089,
4536
+ "learning_rate": 2.75712137902703e-05,
4537
+ "loss": 1.1669,
4538
+ "step": 639
4539
+ },
4540
+ {
4541
+ "epoch": 0.04068173691947082,
4542
+ "grad_norm": 0.25534749031066895,
4543
+ "learning_rate": 2.7310754815685624e-05,
4544
+ "loss": 1.2057,
4545
+ "step": 640
4546
+ },
4547
+ {
4548
+ "epoch": 0.04074530213340749,
4549
+ "grad_norm": 0.2514583170413971,
4550
+ "learning_rate": 2.7051337230617125e-05,
4551
+ "loss": 1.2483,
4552
+ "step": 641
4553
+ },
4554
+ {
4555
+ "epoch": 0.040808867347344166,
4556
+ "grad_norm": 0.25142601132392883,
4557
+ "learning_rate": 2.679296475163595e-05,
4558
+ "loss": 1.1685,
4559
+ "step": 642
4560
+ },
4561
+ {
4562
+ "epoch": 0.04087243256128084,
4563
+ "grad_norm": 0.2746109962463379,
4564
+ "learning_rate": 2.6535641080340458e-05,
4565
+ "loss": 1.2658,
4566
+ "step": 643
4567
+ },
4568
+ {
4569
+ "epoch": 0.04093599777521751,
4570
+ "grad_norm": 0.26082682609558105,
4571
+ "learning_rate": 2.6279369903303175e-05,
4572
+ "loss": 1.2184,
4573
+ "step": 644
4574
+ },
4575
+ {
4576
+ "epoch": 0.040999562989154185,
4577
+ "grad_norm": 0.27172860503196716,
4578
+ "learning_rate": 2.6024154892017937e-05,
4579
+ "loss": 1.2417,
4580
+ "step": 645
4581
+ },
4582
+ {
4583
+ "epoch": 0.04106312820309086,
4584
+ "grad_norm": 0.26511403918266296,
4585
+ "learning_rate": 2.5769999702847346e-05,
4586
+ "loss": 1.2099,
4587
+ "step": 646
4588
+ },
4589
+ {
4590
+ "epoch": 0.04112669341702753,
4591
+ "grad_norm": 0.26414263248443604,
4592
+ "learning_rate": 2.5516907976970328e-05,
4593
+ "loss": 1.2562,
4594
+ "step": 647
4595
+ },
4596
+ {
4597
+ "epoch": 0.041190258630964205,
4598
+ "grad_norm": 0.25787821412086487,
4599
+ "learning_rate": 2.5264883340330113e-05,
4600
+ "loss": 1.2202,
4601
+ "step": 648
4602
+ },
4603
+ {
4604
+ "epoch": 0.04125382384490088,
4605
+ "grad_norm": 0.25424811244010925,
4606
+ "learning_rate": 2.501392940358197e-05,
4607
+ "loss": 1.2154,
4608
+ "step": 649
4609
+ },
4610
+ {
4611
+ "epoch": 0.04131738905883755,
4612
+ "grad_norm": 0.26234978437423706,
4613
+ "learning_rate": 2.4764049762041874e-05,
4614
+ "loss": 1.242,
4615
+ "step": 650
4616
+ },
4617
+ {
4618
+ "epoch": 0.041380954272774224,
4619
+ "grad_norm": 0.25528523325920105,
4620
+ "learning_rate": 2.4515247995634694e-05,
4621
+ "loss": 1.1873,
4622
+ "step": 651
4623
+ },
4624
+ {
4625
+ "epoch": 0.0414445194867109,
4626
+ "grad_norm": 0.2629062235355377,
4627
+ "learning_rate": 2.426752766884306e-05,
4628
+ "loss": 1.1596,
4629
+ "step": 652
4630
+ },
4631
+ {
4632
+ "epoch": 0.04150808470064757,
4633
+ "grad_norm": 0.24369929730892181,
4634
+ "learning_rate": 2.4020892330656252e-05,
4635
+ "loss": 1.069,
4636
+ "step": 653
4637
+ },
4638
+ {
4639
+ "epoch": 0.041571649914584244,
4640
+ "grad_norm": 0.2602699100971222,
4641
+ "learning_rate": 2.377534551451932e-05,
4642
+ "loss": 1.2132,
4643
+ "step": 654
4644
+ },
4645
+ {
4646
+ "epoch": 0.04163521512852092,
4647
+ "grad_norm": 0.24992002546787262,
4648
+ "learning_rate": 2.353089073828255e-05,
4649
+ "loss": 1.1259,
4650
+ "step": 655
4651
+ },
4652
+ {
4653
+ "epoch": 0.04169878034245759,
4654
+ "grad_norm": 0.2784167528152466,
4655
+ "learning_rate": 2.328753150415094e-05,
4656
+ "loss": 1.1997,
4657
+ "step": 656
4658
+ },
4659
+ {
4660
+ "epoch": 0.04176234555639426,
4661
+ "grad_norm": 0.2581193745136261,
4662
+ "learning_rate": 2.304527129863424e-05,
4663
+ "loss": 1.1832,
4664
+ "step": 657
4665
+ },
4666
+ {
4667
+ "epoch": 0.04182591077033094,
4668
+ "grad_norm": 0.25155678391456604,
4669
+ "learning_rate": 2.280411359249668e-05,
4670
+ "loss": 1.147,
4671
+ "step": 658
4672
+ },
4673
+ {
4674
+ "epoch": 0.04188947598426761,
4675
+ "grad_norm": 0.2618091106414795,
4676
+ "learning_rate": 2.2564061840707495e-05,
4677
+ "loss": 1.303,
4678
+ "step": 659
4679
+ },
4680
+ {
4681
+ "epoch": 0.04195304119820428,
4682
+ "grad_norm": 0.2630173861980438,
4683
+ "learning_rate": 2.2325119482391467e-05,
4684
+ "loss": 1.2555,
4685
+ "step": 660
4686
+ },
4687
+ {
4688
+ "epoch": 0.04201660641214096,
4689
+ "grad_norm": 0.25127795338630676,
4690
+ "learning_rate": 2.2087289940779343e-05,
4691
+ "loss": 1.1694,
4692
+ "step": 661
4693
+ },
4694
+ {
4695
+ "epoch": 0.04208017162607763,
4696
+ "grad_norm": 0.2526141107082367,
4697
+ "learning_rate": 2.185057662315918e-05,
4698
+ "loss": 1.0997,
4699
+ "step": 662
4700
+ },
4701
+ {
4702
+ "epoch": 0.0421437368400143,
4703
+ "grad_norm": 0.2466498613357544,
4704
+ "learning_rate": 2.1614982920827243e-05,
4705
+ "loss": 1.2093,
4706
+ "step": 663
4707
+ },
4708
+ {
4709
+ "epoch": 0.04220730205395098,
4710
+ "grad_norm": 0.2559715211391449,
4711
+ "learning_rate": 2.1380512209039528e-05,
4712
+ "loss": 1.239,
4713
+ "step": 664
4714
+ },
4715
+ {
4716
+ "epoch": 0.042270867267887646,
4717
+ "grad_norm": 0.24562884867191315,
4718
+ "learning_rate": 2.1147167846963422e-05,
4719
+ "loss": 1.1716,
4720
+ "step": 665
4721
+ },
4722
+ {
4723
+ "epoch": 0.04233443248182432,
4724
+ "grad_norm": 0.25966036319732666,
4725
+ "learning_rate": 2.0914953177629548e-05,
4726
+ "loss": 1.2553,
4727
+ "step": 666
4728
+ },
4729
+ {
4730
+ "epoch": 0.042397997695761,
4731
+ "grad_norm": 0.25772759318351746,
4732
+ "learning_rate": 2.068387152788387e-05,
4733
+ "loss": 1.1341,
4734
+ "step": 667
4735
+ },
4736
+ {
4737
+ "epoch": 0.042461562909697666,
4738
+ "grad_norm": 0.24900874495506287,
4739
+ "learning_rate": 2.0453926208340003e-05,
4740
+ "loss": 1.1742,
4741
+ "step": 668
4742
+ },
4743
+ {
4744
+ "epoch": 0.04252512812363434,
4745
+ "grad_norm": 0.2540144622325897,
4746
+ "learning_rate": 2.022512051333194e-05,
4747
+ "loss": 1.1856,
4748
+ "step": 669
4749
+ },
4750
+ {
4751
+ "epoch": 0.04258869333757102,
4752
+ "grad_norm": 0.26840710639953613,
4753
+ "learning_rate": 1.999745772086655e-05,
4754
+ "loss": 1.2104,
4755
+ "step": 670
4756
+ },
4757
+ {
4758
+ "epoch": 0.042652258551507685,
4759
+ "grad_norm": 0.2511826753616333,
4760
+ "learning_rate": 1.9770941092576957e-05,
4761
+ "loss": 1.2477,
4762
+ "step": 671
4763
+ },
4764
+ {
4765
+ "epoch": 0.04271582376544436,
4766
+ "grad_norm": 0.26480165123939514,
4767
+ "learning_rate": 1.954557387367557e-05,
4768
+ "loss": 1.1991,
4769
+ "step": 672
4770
+ },
4771
+ {
4772
+ "epoch": 0.042779388979381036,
4773
+ "grad_norm": 0.2562330663204193,
4774
+ "learning_rate": 1.9321359292907702e-05,
4775
+ "loss": 1.2336,
4776
+ "step": 673
4777
+ },
4778
+ {
4779
+ "epoch": 0.042842954193317705,
4780
+ "grad_norm": 0.25312507152557373,
4781
+ "learning_rate": 1.9098300562505266e-05,
4782
+ "loss": 1.1593,
4783
+ "step": 674
4784
+ },
4785
+ {
4786
+ "epoch": 0.04290651940725438,
4787
+ "grad_norm": 0.2678249180316925,
4788
+ "learning_rate": 1.8876400878140775e-05,
4789
+ "loss": 1.1886,
4790
+ "step": 675
4791
+ },
4792
+ {
4793
+ "epoch": 0.042970084621191056,
4794
+ "grad_norm": 0.25428783893585205,
4795
+ "learning_rate": 1.8655663418881584e-05,
4796
+ "loss": 1.2123,
4797
+ "step": 676
4798
+ },
4799
+ {
4800
+ "epoch": 0.043033649835127724,
4801
+ "grad_norm": 0.2611987292766571,
4802
+ "learning_rate": 1.8436091347144246e-05,
4803
+ "loss": 1.2407,
4804
+ "step": 677
4805
+ },
4806
+ {
4807
+ "epoch": 0.0430972150490644,
4808
+ "grad_norm": 0.2611881196498871,
4809
+ "learning_rate": 1.821768780864943e-05,
4810
+ "loss": 1.1918,
4811
+ "step": 678
4812
+ },
4813
+ {
4814
+ "epoch": 0.043160780263001075,
4815
+ "grad_norm": 0.2661250829696655,
4816
+ "learning_rate": 1.800045593237647e-05,
4817
+ "loss": 1.2046,
4818
+ "step": 679
4819
+ },
4820
+ {
4821
+ "epoch": 0.043224345476937744,
4822
+ "grad_norm": 0.2643533945083618,
4823
+ "learning_rate": 1.7784398830519e-05,
4824
+ "loss": 1.1827,
4825
+ "step": 680
4826
+ },
4827
+ {
4828
+ "epoch": 0.04328791069087442,
4829
+ "grad_norm": 0.25061362981796265,
4830
+ "learning_rate": 1.756951959844e-05,
4831
+ "loss": 1.2051,
4832
+ "step": 681
4833
+ },
4834
+ {
4835
+ "epoch": 0.043351475904811095,
4836
+ "grad_norm": 0.24832050502300262,
4837
+ "learning_rate": 1.7355821314627564e-05,
4838
+ "loss": 1.1704,
4839
+ "step": 682
4840
+ },
4841
+ {
4842
+ "epoch": 0.04341504111874776,
4843
+ "grad_norm": 0.26712068915367126,
4844
+ "learning_rate": 1.7143307040650925e-05,
4845
+ "loss": 1.2655,
4846
+ "step": 683
4847
+ },
4848
+ {
4849
+ "epoch": 0.04347860633268444,
4850
+ "grad_norm": 0.26257115602493286,
4851
+ "learning_rate": 1.6931979821116418e-05,
4852
+ "loss": 1.183,
4853
+ "step": 684
4854
+ },
4855
+ {
4856
+ "epoch": 0.043542171546621114,
4857
+ "grad_norm": 0.2578732371330261,
4858
+ "learning_rate": 1.672184268362391e-05,
4859
+ "loss": 1.1036,
4860
+ "step": 685
4861
+ },
4862
+ {
4863
+ "epoch": 0.04360573676055778,
4864
+ "grad_norm": 0.25747859477996826,
4865
+ "learning_rate": 1.6512898638723497e-05,
4866
+ "loss": 1.2769,
4867
+ "step": 686
4868
+ },
4869
+ {
4870
+ "epoch": 0.04366930197449446,
4871
+ "grad_norm": 0.26593005657196045,
4872
+ "learning_rate": 1.630515067987226e-05,
4873
+ "loss": 1.2707,
4874
+ "step": 687
4875
+ },
4876
+ {
4877
+ "epoch": 0.04373286718843113,
4878
+ "grad_norm": 0.2610760033130646,
4879
+ "learning_rate": 1.6098601783391487e-05,
4880
+ "loss": 1.2226,
4881
+ "step": 688
4882
+ },
4883
+ {
4884
+ "epoch": 0.0437964324023678,
4885
+ "grad_norm": 0.2636644244194031,
4886
+ "learning_rate": 1.5893254908423937e-05,
4887
+ "loss": 1.194,
4888
+ "step": 689
4889
+ },
4890
+ {
4891
+ "epoch": 0.04385999761630448,
4892
+ "grad_norm": 0.25099021196365356,
4893
+ "learning_rate": 1.5689112996891576e-05,
4894
+ "loss": 1.1853,
4895
+ "step": 690
4896
+ },
4897
+ {
4898
+ "epoch": 0.04392356283024115,
4899
+ "grad_norm": 0.26002123951911926,
4900
+ "learning_rate": 1.54861789734532e-05,
4901
+ "loss": 1.1705,
4902
+ "step": 691
4903
+ },
4904
+ {
4905
+ "epoch": 0.04398712804417782,
4906
+ "grad_norm": 0.25610899925231934,
4907
+ "learning_rate": 1.5284455745462834e-05,
4908
+ "loss": 1.173,
4909
+ "step": 692
4910
+ },
4911
+ {
4912
+ "epoch": 0.0440506932581145,
4913
+ "grad_norm": 0.2630417048931122,
4914
+ "learning_rate": 1.5083946202927824e-05,
4915
+ "loss": 1.183,
4916
+ "step": 693
4917
+ },
4918
+ {
4919
+ "epoch": 0.04411425847205117,
4920
+ "grad_norm": 0.26131799817085266,
4921
+ "learning_rate": 1.4884653218467571e-05,
4922
+ "loss": 1.2147,
4923
+ "step": 694
4924
+ },
4925
+ {
4926
+ "epoch": 0.04417782368598784,
4927
+ "grad_norm": 0.2511073052883148,
4928
+ "learning_rate": 1.4686579647272336e-05,
4929
+ "loss": 1.1362,
4930
+ "step": 695
4931
+ },
4932
+ {
4933
+ "epoch": 0.044241388899924516,
4934
+ "grad_norm": 0.2500525414943695,
4935
+ "learning_rate": 1.4489728327062324e-05,
4936
+ "loss": 1.1264,
4937
+ "step": 696
4938
+ },
4939
+ {
4940
+ "epoch": 0.04430495411386119,
4941
+ "grad_norm": 0.2648208439350128,
4942
+ "learning_rate": 1.4294102078047055e-05,
4943
+ "loss": 1.2098,
4944
+ "step": 697
4945
+ },
4946
+ {
4947
+ "epoch": 0.04436851932779786,
4948
+ "grad_norm": 0.2602032721042633,
4949
+ "learning_rate": 1.4099703702884936e-05,
4950
+ "loss": 1.2527,
4951
+ "step": 698
4952
+ },
4953
+ {
4954
+ "epoch": 0.044432084541734536,
4955
+ "grad_norm": 0.26263752579689026,
4956
+ "learning_rate": 1.3906535986643176e-05,
4957
+ "loss": 1.218,
4958
+ "step": 699
4959
+ },
4960
+ {
4961
+ "epoch": 0.04449564975567121,
4962
+ "grad_norm": 0.2635667622089386,
4963
+ "learning_rate": 1.3714601696757712e-05,
4964
+ "loss": 1.2896,
4965
+ "step": 700
4966
+ },
4967
+ {
4968
+ "epoch": 0.04449564975567121,
4969
+ "eval_loss": 1.2046868801116943,
4970
+ "eval_runtime": 1238.8537,
4971
+ "eval_samples_per_second": 4.036,
4972
+ "eval_steps_per_second": 1.009,
4973
+ "step": 700
4974
  }
4975
  ],
4976
  "logging_steps": 1,
 
4999
  "attributes": {}
5000
  }
5001
  },
5002
+ "total_flos": 3.637357672660992e+18,
5003
  "train_batch_size": 4,
5004
  "trial_name": null,
5005
  "trial_params": null