SummerSigh commited on
Commit
bd8a4e1
1 Parent(s): 3afb0a7

Upload 8 files

Browse files
Files changed (4) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. scheduler.pt +1 -1
  4. trainer_state.json +884 -4
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fe830f215593241d5a06be7b7382d26df2df95fa5877818c55602cc0aaaec7c
3
  size 18494040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f64a3cafa47c9ba3e54437d1f9852c222a0087b81b8ce6e387c02057cb1bfd3
3
  size 18494040
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ffc9e9ba9737e7047e65caa20e5526ed8da4c213c4ce3f2cca848b1ac8ecdbd
3
  size 37035002
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be93c1be2bc3f33f7d84eeeeb4d8c4d995ed64199a72fdbe553b1f003bc30445
3
  size 37035002
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c4c29cdd3791b07f27eccf7f1e479362eae64a2df7b8ed21d32b1b0f2e78f0d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60cbdd85cf6bcb7c6140c88eacbc709e5746be6620fc2427f93d0a9c73d83631
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0964851864300316,
5
  "eval_steps": 500,
6
- "global_step": 99500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5311,11 +5311,891 @@
5311
  "loss": 4.3878,
5312
  "num_input_tokens_seen": 581214146,
5313
  "step": 99450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5314
  }
5315
  ],
5316
  "logging_steps": 150,
5317
  "max_steps": 272232,
5318
- "num_input_tokens_seen": 581503010,
5319
  "num_train_epochs": 3,
5320
  "save_steps": 500,
5321
  "stateful_callbacks": {
@@ -5330,7 +6210,7 @@
5330
  "attributes": {}
5331
  }
5332
  },
5333
- "total_flos": 8978778636326400.0,
5334
  "train_batch_size": 32,
5335
  "trial_name": null,
5336
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.2783143881998358,
5
  "eval_steps": 500,
6
+ "global_step": 116000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5311
  "loss": 4.3878,
5312
  "num_input_tokens_seen": 581214146,
5313
  "step": 99450
5314
+ },
5315
+ {
5316
+ "epoch": 1.0975871815922729,
5317
+ "grad_norm": 1.9078856706619263,
5318
+ "learning_rate": 0.00010148434400575989,
5319
+ "loss": 4.3976,
5320
+ "num_input_tokens_seen": 582094658,
5321
+ "step": 99600
5322
+ },
5323
+ {
5324
+ "epoch": 1.0992401743356346,
5325
+ "grad_norm": 1.8750337362289429,
5326
+ "learning_rate": 0.00010139618261163437,
5327
+ "loss": 4.3999,
5328
+ "num_input_tokens_seen": 582981922,
5329
+ "step": 99750
5330
+ },
5331
+ {
5332
+ "epoch": 1.1008931670789965,
5333
+ "grad_norm": 1.9243488311767578,
5334
+ "learning_rate": 0.00010130802121750887,
5335
+ "loss": 4.3879,
5336
+ "num_input_tokens_seen": 583869026,
5337
+ "step": 99900
5338
+ },
5339
+ {
5340
+ "epoch": 1.1025461598223583,
5341
+ "grad_norm": 1.8446391820907593,
5342
+ "learning_rate": 0.00010121985982338334,
5343
+ "loss": 4.3894,
5344
+ "num_input_tokens_seen": 584749826,
5345
+ "step": 100050
5346
+ },
5347
+ {
5348
+ "epoch": 1.1041991525657202,
5349
+ "grad_norm": 1.726158857345581,
5350
+ "learning_rate": 0.00010113169842925785,
5351
+ "loss": 4.3985,
5352
+ "num_input_tokens_seen": 585630274,
5353
+ "step": 100200
5354
+ },
5355
+ {
5356
+ "epoch": 1.1058521453090822,
5357
+ "grad_norm": 1.8227604627609253,
5358
+ "learning_rate": 0.00010104353703513232,
5359
+ "loss": 4.3906,
5360
+ "num_input_tokens_seen": 586484930,
5361
+ "step": 100350
5362
+ },
5363
+ {
5364
+ "epoch": 1.1075051380524439,
5365
+ "grad_norm": 1.9156420230865479,
5366
+ "learning_rate": 0.00010095537564100682,
5367
+ "loss": 4.3893,
5368
+ "num_input_tokens_seen": 587352738,
5369
+ "step": 100500
5370
+ },
5371
+ {
5372
+ "epoch": 1.1091581307958058,
5373
+ "grad_norm": 1.8385225534439087,
5374
+ "learning_rate": 0.0001008678019895088,
5375
+ "loss": 4.3994,
5376
+ "num_input_tokens_seen": 588239810,
5377
+ "step": 100650
5378
+ },
5379
+ {
5380
+ "epoch": 1.1108111235391678,
5381
+ "grad_norm": 1.9076261520385742,
5382
+ "learning_rate": 0.00010077964059538329,
5383
+ "loss": 4.3922,
5384
+ "num_input_tokens_seen": 589116514,
5385
+ "step": 100800
5386
+ },
5387
+ {
5388
+ "epoch": 1.1124641162825295,
5389
+ "grad_norm": 1.8701651096343994,
5390
+ "learning_rate": 0.00010069147920125778,
5391
+ "loss": 4.4015,
5392
+ "num_input_tokens_seen": 589983426,
5393
+ "step": 100950
5394
+ },
5395
+ {
5396
+ "epoch": 1.1141171090258915,
5397
+ "grad_norm": 1.9545180797576904,
5398
+ "learning_rate": 0.00010060331780713227,
5399
+ "loss": 4.3978,
5400
+ "num_input_tokens_seen": 590856994,
5401
+ "step": 101100
5402
+ },
5403
+ {
5404
+ "epoch": 1.1157701017692532,
5405
+ "grad_norm": 1.9418137073516846,
5406
+ "learning_rate": 0.00010051515641300676,
5407
+ "loss": 4.3893,
5408
+ "num_input_tokens_seen": 591735490,
5409
+ "step": 101250
5410
+ },
5411
+ {
5412
+ "epoch": 1.1174230945126151,
5413
+ "grad_norm": 1.892683982849121,
5414
+ "learning_rate": 0.00010042699501888123,
5415
+ "loss": 4.3833,
5416
+ "num_input_tokens_seen": 592622626,
5417
+ "step": 101400
5418
+ },
5419
+ {
5420
+ "epoch": 1.1190760872559768,
5421
+ "grad_norm": 1.830404281616211,
5422
+ "learning_rate": 0.00010033883362475573,
5423
+ "loss": 4.3939,
5424
+ "num_input_tokens_seen": 593500354,
5425
+ "step": 101550
5426
+ },
5427
+ {
5428
+ "epoch": 1.1207290799993388,
5429
+ "grad_norm": 1.8536481857299805,
5430
+ "learning_rate": 0.00010025067223063021,
5431
+ "loss": 4.3826,
5432
+ "num_input_tokens_seen": 594383234,
5433
+ "step": 101700
5434
+ },
5435
+ {
5436
+ "epoch": 1.1223820727427007,
5437
+ "grad_norm": 1.84872567653656,
5438
+ "learning_rate": 0.00010016251083650471,
5439
+ "loss": 4.3847,
5440
+ "num_input_tokens_seen": 595255266,
5441
+ "step": 101850
5442
+ },
5443
+ {
5444
+ "epoch": 1.1240350654860625,
5445
+ "grad_norm": 1.8653180599212646,
5446
+ "learning_rate": 0.00010007434944237918,
5447
+ "loss": 4.392,
5448
+ "num_input_tokens_seen": 596135586,
5449
+ "step": 102000
5450
+ },
5451
+ {
5452
+ "epoch": 1.1256880582294244,
5453
+ "grad_norm": 1.8534561395645142,
5454
+ "learning_rate": 9.998618804825369e-05,
5455
+ "loss": 4.3862,
5456
+ "num_input_tokens_seen": 597009218,
5457
+ "step": 102150
5458
+ },
5459
+ {
5460
+ "epoch": 1.1273410509727861,
5461
+ "grad_norm": 1.8982864618301392,
5462
+ "learning_rate": 9.989802665412816e-05,
5463
+ "loss": 4.3969,
5464
+ "num_input_tokens_seen": 597873026,
5465
+ "step": 102300
5466
+ },
5467
+ {
5468
+ "epoch": 1.128994043716148,
5469
+ "grad_norm": 1.9212620258331299,
5470
+ "learning_rate": 9.980986526000266e-05,
5471
+ "loss": 4.3872,
5472
+ "num_input_tokens_seen": 598748322,
5473
+ "step": 102450
5474
+ },
5475
+ {
5476
+ "epoch": 1.13064703645951,
5477
+ "grad_norm": 1.8133482933044434,
5478
+ "learning_rate": 9.972170386587714e-05,
5479
+ "loss": 4.3801,
5480
+ "num_input_tokens_seen": 599625410,
5481
+ "step": 102600
5482
+ },
5483
+ {
5484
+ "epoch": 1.1323000292028718,
5485
+ "grad_norm": 1.8521312475204468,
5486
+ "learning_rate": 9.963354247175164e-05,
5487
+ "loss": 4.3867,
5488
+ "num_input_tokens_seen": 600489762,
5489
+ "step": 102750
5490
+ },
5491
+ {
5492
+ "epoch": 1.1339530219462337,
5493
+ "grad_norm": 2.050074577331543,
5494
+ "learning_rate": 9.954538107762612e-05,
5495
+ "loss": 4.3813,
5496
+ "num_input_tokens_seen": 601357666,
5497
+ "step": 102900
5498
+ },
5499
+ {
5500
+ "epoch": 1.1356060146895954,
5501
+ "grad_norm": 1.8785549402236938,
5502
+ "learning_rate": 9.945721968350062e-05,
5503
+ "loss": 4.3799,
5504
+ "num_input_tokens_seen": 602239362,
5505
+ "step": 103050
5506
+ },
5507
+ {
5508
+ "epoch": 1.1372590074329574,
5509
+ "grad_norm": 1.9237360954284668,
5510
+ "learning_rate": 9.93690582893751e-05,
5511
+ "loss": 4.3902,
5512
+ "num_input_tokens_seen": 603119650,
5513
+ "step": 103200
5514
+ },
5515
+ {
5516
+ "epoch": 1.1389120001763193,
5517
+ "grad_norm": 1.8664278984069824,
5518
+ "learning_rate": 9.928089689524957e-05,
5519
+ "loss": 4.3905,
5520
+ "num_input_tokens_seen": 603985666,
5521
+ "step": 103350
5522
+ },
5523
+ {
5524
+ "epoch": 1.140564992919681,
5525
+ "grad_norm": 1.812515139579773,
5526
+ "learning_rate": 9.919273550112407e-05,
5527
+ "loss": 4.3757,
5528
+ "num_input_tokens_seen": 604874530,
5529
+ "step": 103500
5530
+ },
5531
+ {
5532
+ "epoch": 1.142217985663043,
5533
+ "grad_norm": 1.9093918800354004,
5534
+ "learning_rate": 9.910457410699855e-05,
5535
+ "loss": 4.4058,
5536
+ "num_input_tokens_seen": 605755394,
5537
+ "step": 103650
5538
+ },
5539
+ {
5540
+ "epoch": 1.1438709784064047,
5541
+ "grad_norm": 1.9712496995925903,
5542
+ "learning_rate": 9.901641271287305e-05,
5543
+ "loss": 4.3848,
5544
+ "num_input_tokens_seen": 606649794,
5545
+ "step": 103800
5546
+ },
5547
+ {
5548
+ "epoch": 1.1455239711497667,
5549
+ "grad_norm": 1.9102181196212769,
5550
+ "learning_rate": 9.892825131874752e-05,
5551
+ "loss": 4.3926,
5552
+ "num_input_tokens_seen": 607513858,
5553
+ "step": 103950
5554
+ },
5555
+ {
5556
+ "epoch": 1.1471769638931284,
5557
+ "grad_norm": 1.7749512195587158,
5558
+ "learning_rate": 9.884008992462201e-05,
5559
+ "loss": 4.3906,
5560
+ "num_input_tokens_seen": 608391202,
5561
+ "step": 104100
5562
+ },
5563
+ {
5564
+ "epoch": 1.1488299566364903,
5565
+ "grad_norm": 1.8394023180007935,
5566
+ "learning_rate": 9.87519285304965e-05,
5567
+ "loss": 4.3814,
5568
+ "num_input_tokens_seen": 609282018,
5569
+ "step": 104250
5570
+ },
5571
+ {
5572
+ "epoch": 1.1504829493798523,
5573
+ "grad_norm": 1.9161593914031982,
5574
+ "learning_rate": 9.866376713637099e-05,
5575
+ "loss": 4.3947,
5576
+ "num_input_tokens_seen": 610168514,
5577
+ "step": 104400
5578
+ },
5579
+ {
5580
+ "epoch": 1.152135942123214,
5581
+ "grad_norm": 1.930790901184082,
5582
+ "learning_rate": 9.857560574224548e-05,
5583
+ "loss": 4.3928,
5584
+ "num_input_tokens_seen": 611052354,
5585
+ "step": 104550
5586
+ },
5587
+ {
5588
+ "epoch": 1.153788934866576,
5589
+ "grad_norm": 1.836146354675293,
5590
+ "learning_rate": 9.848803209074748e-05,
5591
+ "loss": 4.3977,
5592
+ "num_input_tokens_seen": 611926498,
5593
+ "step": 104700
5594
+ },
5595
+ {
5596
+ "epoch": 1.155441927609938,
5597
+ "grad_norm": 1.7802364826202393,
5598
+ "learning_rate": 9.839987069662196e-05,
5599
+ "loss": 4.3921,
5600
+ "num_input_tokens_seen": 612818210,
5601
+ "step": 104850
5602
+ },
5603
+ {
5604
+ "epoch": 1.1570949203532996,
5605
+ "grad_norm": 1.9587794542312622,
5606
+ "learning_rate": 9.831170930249643e-05,
5607
+ "loss": 4.3925,
5608
+ "num_input_tokens_seen": 613694850,
5609
+ "step": 105000
5610
+ },
5611
+ {
5612
+ "epoch": 1.1587479130966616,
5613
+ "grad_norm": 1.9676165580749512,
5614
+ "learning_rate": 9.822354790837093e-05,
5615
+ "loss": 4.3782,
5616
+ "num_input_tokens_seen": 614583618,
5617
+ "step": 105150
5618
+ },
5619
+ {
5620
+ "epoch": 1.1604009058400233,
5621
+ "grad_norm": 1.8942914009094238,
5622
+ "learning_rate": 9.813538651424541e-05,
5623
+ "loss": 4.3792,
5624
+ "num_input_tokens_seen": 615478530,
5625
+ "step": 105300
5626
+ },
5627
+ {
5628
+ "epoch": 1.1620538985833853,
5629
+ "grad_norm": 1.8436447381973267,
5630
+ "learning_rate": 9.804722512011991e-05,
5631
+ "loss": 4.3848,
5632
+ "num_input_tokens_seen": 616374914,
5633
+ "step": 105450
5634
+ },
5635
+ {
5636
+ "epoch": 1.163706891326747,
5637
+ "grad_norm": 1.9150909185409546,
5638
+ "learning_rate": 9.795906372599439e-05,
5639
+ "loss": 4.381,
5640
+ "num_input_tokens_seen": 617260162,
5641
+ "step": 105600
5642
+ },
5643
+ {
5644
+ "epoch": 1.165359884070109,
5645
+ "grad_norm": 2.0403525829315186,
5646
+ "learning_rate": 9.787090233186889e-05,
5647
+ "loss": 4.3835,
5648
+ "num_input_tokens_seen": 618136386,
5649
+ "step": 105750
5650
+ },
5651
+ {
5652
+ "epoch": 1.1670128768134709,
5653
+ "grad_norm": 1.8062185049057007,
5654
+ "learning_rate": 9.778274093774336e-05,
5655
+ "loss": 4.3821,
5656
+ "num_input_tokens_seen": 619009282,
5657
+ "step": 105900
5658
+ },
5659
+ {
5660
+ "epoch": 1.1686658695568326,
5661
+ "grad_norm": 1.9948753118515015,
5662
+ "learning_rate": 9.769457954361787e-05,
5663
+ "loss": 4.3911,
5664
+ "num_input_tokens_seen": 619886722,
5665
+ "step": 106050
5666
+ },
5667
+ {
5668
+ "epoch": 1.1703188623001946,
5669
+ "grad_norm": 1.8109992742538452,
5670
+ "learning_rate": 9.760641814949234e-05,
5671
+ "loss": 4.3791,
5672
+ "num_input_tokens_seen": 620758178,
5673
+ "step": 106200
5674
+ },
5675
+ {
5676
+ "epoch": 1.1719718550435563,
5677
+ "grad_norm": 1.9707014560699463,
5678
+ "learning_rate": 9.751825675536684e-05,
5679
+ "loss": 4.3809,
5680
+ "num_input_tokens_seen": 621629506,
5681
+ "step": 106350
5682
+ },
5683
+ {
5684
+ "epoch": 1.1736248477869182,
5685
+ "grad_norm": 1.9458143711090088,
5686
+ "learning_rate": 9.743009536124132e-05,
5687
+ "loss": 4.3952,
5688
+ "num_input_tokens_seen": 622496418,
5689
+ "step": 106500
5690
+ },
5691
+ {
5692
+ "epoch": 1.17527784053028,
5693
+ "grad_norm": 1.9349957704544067,
5694
+ "learning_rate": 9.734310945237081e-05,
5695
+ "loss": 4.379,
5696
+ "num_input_tokens_seen": 623395010,
5697
+ "step": 106650
5698
+ },
5699
+ {
5700
+ "epoch": 1.176930833273642,
5701
+ "grad_norm": 1.9133590459823608,
5702
+ "learning_rate": 9.725494805824531e-05,
5703
+ "loss": 4.3689,
5704
+ "num_input_tokens_seen": 624262434,
5705
+ "step": 106800
5706
+ },
5707
+ {
5708
+ "epoch": 1.1785838260170038,
5709
+ "grad_norm": 1.9451539516448975,
5710
+ "learning_rate": 9.716678666411979e-05,
5711
+ "loss": 4.3863,
5712
+ "num_input_tokens_seen": 625153506,
5713
+ "step": 106950
5714
+ },
5715
+ {
5716
+ "epoch": 1.1802368187603656,
5717
+ "grad_norm": 2.0072357654571533,
5718
+ "learning_rate": 9.707862526999429e-05,
5719
+ "loss": 4.378,
5720
+ "num_input_tokens_seen": 626026690,
5721
+ "step": 107100
5722
+ },
5723
+ {
5724
+ "epoch": 1.1818898115037275,
5725
+ "grad_norm": 1.7655397653579712,
5726
+ "learning_rate": 9.699046387586877e-05,
5727
+ "loss": 4.3801,
5728
+ "num_input_tokens_seen": 626902594,
5729
+ "step": 107250
5730
+ },
5731
+ {
5732
+ "epoch": 1.1835428042470895,
5733
+ "grad_norm": 1.9583156108856201,
5734
+ "learning_rate": 9.690230248174325e-05,
5735
+ "loss": 4.3902,
5736
+ "num_input_tokens_seen": 627796194,
5737
+ "step": 107400
5738
+ },
5739
+ {
5740
+ "epoch": 1.1851957969904512,
5741
+ "grad_norm": 1.7717612981796265,
5742
+ "learning_rate": 9.681414108761774e-05,
5743
+ "loss": 4.3812,
5744
+ "num_input_tokens_seen": 628675970,
5745
+ "step": 107550
5746
+ },
5747
+ {
5748
+ "epoch": 1.1868487897338131,
5749
+ "grad_norm": 1.9090009927749634,
5750
+ "learning_rate": 9.672597969349223e-05,
5751
+ "loss": 4.3889,
5752
+ "num_input_tokens_seen": 629549794,
5753
+ "step": 107700
5754
+ },
5755
+ {
5756
+ "epoch": 1.1885017824771749,
5757
+ "grad_norm": 1.8910843133926392,
5758
+ "learning_rate": 9.663781829936672e-05,
5759
+ "loss": 4.3913,
5760
+ "num_input_tokens_seen": 630437378,
5761
+ "step": 107850
5762
+ },
5763
+ {
5764
+ "epoch": 1.1901547752205368,
5765
+ "grad_norm": 1.840728521347046,
5766
+ "learning_rate": 9.654965690524121e-05,
5767
+ "loss": 4.3792,
5768
+ "num_input_tokens_seen": 631313666,
5769
+ "step": 108000
5770
+ },
5771
+ {
5772
+ "epoch": 1.1918077679638985,
5773
+ "grad_norm": 1.8772791624069214,
5774
+ "learning_rate": 9.64614955111157e-05,
5775
+ "loss": 4.3813,
5776
+ "num_input_tokens_seen": 632194466,
5777
+ "step": 108150
5778
+ },
5779
+ {
5780
+ "epoch": 1.1934607607072605,
5781
+ "grad_norm": 1.9666273593902588,
5782
+ "learning_rate": 9.637333411699017e-05,
5783
+ "loss": 4.3716,
5784
+ "num_input_tokens_seen": 633058978,
5785
+ "step": 108300
5786
+ },
5787
+ {
5788
+ "epoch": 1.1951137534506224,
5789
+ "grad_norm": 1.930409550666809,
5790
+ "learning_rate": 9.628517272286466e-05,
5791
+ "loss": 4.3934,
5792
+ "num_input_tokens_seen": 633935458,
5793
+ "step": 108450
5794
+ },
5795
+ {
5796
+ "epoch": 1.1967667461939842,
5797
+ "grad_norm": 1.8000093698501587,
5798
+ "learning_rate": 9.619701132873915e-05,
5799
+ "loss": 4.3794,
5800
+ "num_input_tokens_seen": 634825634,
5801
+ "step": 108600
5802
+ },
5803
+ {
5804
+ "epoch": 1.198419738937346,
5805
+ "grad_norm": 1.8369793891906738,
5806
+ "learning_rate": 9.610884993461364e-05,
5807
+ "loss": 4.386,
5808
+ "num_input_tokens_seen": 635701666,
5809
+ "step": 108750
5810
+ },
5811
+ {
5812
+ "epoch": 1.2000727316807078,
5813
+ "grad_norm": 1.9381849765777588,
5814
+ "learning_rate": 9.602068854048813e-05,
5815
+ "loss": 4.3824,
5816
+ "num_input_tokens_seen": 636568994,
5817
+ "step": 108900
5818
+ },
5819
+ {
5820
+ "epoch": 1.2017257244240698,
5821
+ "grad_norm": 1.8089631795883179,
5822
+ "learning_rate": 9.593252714636261e-05,
5823
+ "loss": 4.3733,
5824
+ "num_input_tokens_seen": 637444034,
5825
+ "step": 109050
5826
+ },
5827
+ {
5828
+ "epoch": 1.2033787171674317,
5829
+ "grad_norm": 1.7429847717285156,
5830
+ "learning_rate": 9.584436575223709e-05,
5831
+ "loss": 4.3766,
5832
+ "num_input_tokens_seen": 638321634,
5833
+ "step": 109200
5834
+ },
5835
+ {
5836
+ "epoch": 1.2050317099107934,
5837
+ "grad_norm": 1.9182720184326172,
5838
+ "learning_rate": 9.575620435811159e-05,
5839
+ "loss": 4.3724,
5840
+ "num_input_tokens_seen": 639189538,
5841
+ "step": 109350
5842
+ },
5843
+ {
5844
+ "epoch": 1.2066847026541554,
5845
+ "grad_norm": 1.9700244665145874,
5846
+ "learning_rate": 9.566804296398607e-05,
5847
+ "loss": 4.3859,
5848
+ "num_input_tokens_seen": 640080354,
5849
+ "step": 109500
5850
+ },
5851
+ {
5852
+ "epoch": 1.2083376953975171,
5853
+ "grad_norm": 1.86391019821167,
5854
+ "learning_rate": 9.557988156986057e-05,
5855
+ "loss": 4.3875,
5856
+ "num_input_tokens_seen": 640977634,
5857
+ "step": 109650
5858
+ },
5859
+ {
5860
+ "epoch": 1.209990688140879,
5861
+ "grad_norm": 1.9451704025268555,
5862
+ "learning_rate": 9.549230791836256e-05,
5863
+ "loss": 4.3928,
5864
+ "num_input_tokens_seen": 641871874,
5865
+ "step": 109800
5866
+ },
5867
+ {
5868
+ "epoch": 1.211643680884241,
5869
+ "grad_norm": 2.063884735107422,
5870
+ "learning_rate": 9.540414652423704e-05,
5871
+ "loss": 4.3704,
5872
+ "num_input_tokens_seen": 642751170,
5873
+ "step": 109950
5874
+ },
5875
+ {
5876
+ "epoch": 1.2132966736276027,
5877
+ "grad_norm": 1.8499351739883423,
5878
+ "learning_rate": 9.531598513011154e-05,
5879
+ "loss": 4.3886,
5880
+ "num_input_tokens_seen": 643629698,
5881
+ "step": 110100
5882
+ },
5883
+ {
5884
+ "epoch": 1.2149496663709647,
5885
+ "grad_norm": 1.9735474586486816,
5886
+ "learning_rate": 9.522782373598601e-05,
5887
+ "loss": 4.3854,
5888
+ "num_input_tokens_seen": 644509698,
5889
+ "step": 110250
5890
+ },
5891
+ {
5892
+ "epoch": 1.2166026591143264,
5893
+ "grad_norm": 1.9430962800979614,
5894
+ "learning_rate": 9.513966234186051e-05,
5895
+ "loss": 4.3905,
5896
+ "num_input_tokens_seen": 645395394,
5897
+ "step": 110400
5898
+ },
5899
+ {
5900
+ "epoch": 1.2182556518576884,
5901
+ "grad_norm": 1.9608047008514404,
5902
+ "learning_rate": 9.505150094773499e-05,
5903
+ "loss": 4.383,
5904
+ "num_input_tokens_seen": 646254626,
5905
+ "step": 110550
5906
+ },
5907
+ {
5908
+ "epoch": 1.21990864460105,
5909
+ "grad_norm": 1.9237737655639648,
5910
+ "learning_rate": 9.4963927296237e-05,
5911
+ "loss": 4.3886,
5912
+ "num_input_tokens_seen": 647146658,
5913
+ "step": 110700
5914
+ },
5915
+ {
5916
+ "epoch": 1.221561637344412,
5917
+ "grad_norm": 1.9678759574890137,
5918
+ "learning_rate": 9.487576590211147e-05,
5919
+ "loss": 4.3858,
5920
+ "num_input_tokens_seen": 648004962,
5921
+ "step": 110850
5922
+ },
5923
+ {
5924
+ "epoch": 1.223214630087774,
5925
+ "grad_norm": 1.8643629550933838,
5926
+ "learning_rate": 9.478760450798597e-05,
5927
+ "loss": 4.3718,
5928
+ "num_input_tokens_seen": 648877602,
5929
+ "step": 111000
5930
+ },
5931
+ {
5932
+ "epoch": 1.2248676228311357,
5933
+ "grad_norm": 1.8100017309188843,
5934
+ "learning_rate": 9.469944311386045e-05,
5935
+ "loss": 4.38,
5936
+ "num_input_tokens_seen": 649743970,
5937
+ "step": 111150
5938
+ },
5939
+ {
5940
+ "epoch": 1.2265206155744977,
5941
+ "grad_norm": 1.8271883726119995,
5942
+ "learning_rate": 9.461128171973495e-05,
5943
+ "loss": 4.3911,
5944
+ "num_input_tokens_seen": 650620130,
5945
+ "step": 111300
5946
+ },
5947
+ {
5948
+ "epoch": 1.2281736083178596,
5949
+ "grad_norm": 1.9749687910079956,
5950
+ "learning_rate": 9.452312032560942e-05,
5951
+ "loss": 4.3715,
5952
+ "num_input_tokens_seen": 651492738,
5953
+ "step": 111450
5954
+ },
5955
+ {
5956
+ "epoch": 1.2298266010612213,
5957
+ "grad_norm": 1.9666537046432495,
5958
+ "learning_rate": 9.44349589314839e-05,
5959
+ "loss": 4.3823,
5960
+ "num_input_tokens_seen": 652359170,
5961
+ "step": 111600
5962
+ },
5963
+ {
5964
+ "epoch": 1.2314795938045833,
5965
+ "grad_norm": 1.9260027408599854,
5966
+ "learning_rate": 9.43467975373584e-05,
5967
+ "loss": 4.3862,
5968
+ "num_input_tokens_seen": 653229570,
5969
+ "step": 111750
5970
+ },
5971
+ {
5972
+ "epoch": 1.233132586547945,
5973
+ "grad_norm": 1.8240337371826172,
5974
+ "learning_rate": 9.425863614323288e-05,
5975
+ "loss": 4.3771,
5976
+ "num_input_tokens_seen": 654109090,
5977
+ "step": 111900
5978
+ },
5979
+ {
5980
+ "epoch": 1.234785579291307,
5981
+ "grad_norm": 1.957507848739624,
5982
+ "learning_rate": 9.417047474910738e-05,
5983
+ "loss": 4.3817,
5984
+ "num_input_tokens_seen": 654980482,
5985
+ "step": 112050
5986
+ },
5987
+ {
5988
+ "epoch": 1.2364385720346687,
5989
+ "grad_norm": 1.8944330215454102,
5990
+ "learning_rate": 9.408231335498185e-05,
5991
+ "loss": 4.3812,
5992
+ "num_input_tokens_seen": 655849634,
5993
+ "step": 112200
5994
+ },
5995
+ {
5996
+ "epoch": 1.2380915647780306,
5997
+ "grad_norm": 1.8677889108657837,
5998
+ "learning_rate": 9.399415196085636e-05,
5999
+ "loss": 4.3803,
6000
+ "num_input_tokens_seen": 656736738,
6001
+ "step": 112350
6002
+ },
6003
+ {
6004
+ "epoch": 1.2397445575213926,
6005
+ "grad_norm": 1.8283082246780396,
6006
+ "learning_rate": 9.390599056673083e-05,
6007
+ "loss": 4.3933,
6008
+ "num_input_tokens_seen": 657615938,
6009
+ "step": 112500
6010
+ },
6011
+ {
6012
+ "epoch": 1.2413975502647543,
6013
+ "grad_norm": 1.9106853008270264,
6014
+ "learning_rate": 9.381782917260533e-05,
6015
+ "loss": 4.3847,
6016
+ "num_input_tokens_seen": 658494850,
6017
+ "step": 112650
6018
+ },
6019
+ {
6020
+ "epoch": 1.2430505430081162,
6021
+ "grad_norm": 1.8882030248641968,
6022
+ "learning_rate": 9.372966777847981e-05,
6023
+ "loss": 4.3862,
6024
+ "num_input_tokens_seen": 659363618,
6025
+ "step": 112800
6026
+ },
6027
+ {
6028
+ "epoch": 1.244703535751478,
6029
+ "grad_norm": 1.964934229850769,
6030
+ "learning_rate": 9.36415063843543e-05,
6031
+ "loss": 4.3805,
6032
+ "num_input_tokens_seen": 660234946,
6033
+ "step": 112950
6034
+ },
6035
+ {
6036
+ "epoch": 1.24635652849484,
6037
+ "grad_norm": 1.8856420516967773,
6038
+ "learning_rate": 9.355334499022878e-05,
6039
+ "loss": 4.3794,
6040
+ "num_input_tokens_seen": 661115810,
6041
+ "step": 113100
6042
+ },
6043
+ {
6044
+ "epoch": 1.2480095212382019,
6045
+ "grad_norm": 1.8618583679199219,
6046
+ "learning_rate": 9.346518359610327e-05,
6047
+ "loss": 4.3883,
6048
+ "num_input_tokens_seen": 661994434,
6049
+ "step": 113250
6050
+ },
6051
+ {
6052
+ "epoch": 1.2496625139815636,
6053
+ "grad_norm": 1.9158508777618408,
6054
+ "learning_rate": 9.337702220197776e-05,
6055
+ "loss": 4.3739,
6056
+ "num_input_tokens_seen": 662868834,
6057
+ "step": 113400
6058
+ },
6059
+ {
6060
+ "epoch": 1.2513155067249255,
6061
+ "grad_norm": 1.8499860763549805,
6062
+ "learning_rate": 9.328886080785225e-05,
6063
+ "loss": 4.379,
6064
+ "num_input_tokens_seen": 663752002,
6065
+ "step": 113550
6066
+ },
6067
+ {
6068
+ "epoch": 1.2529684994682873,
6069
+ "grad_norm": 1.8565645217895508,
6070
+ "learning_rate": 9.320069941372673e-05,
6071
+ "loss": 4.3854,
6072
+ "num_input_tokens_seen": 664622402,
6073
+ "step": 113700
6074
+ },
6075
+ {
6076
+ "epoch": 1.2546214922116492,
6077
+ "grad_norm": 2.060188055038452,
6078
+ "learning_rate": 9.311253801960123e-05,
6079
+ "loss": 4.3758,
6080
+ "num_input_tokens_seen": 665495618,
6081
+ "step": 113850
6082
+ },
6083
+ {
6084
+ "epoch": 1.2562744849550112,
6085
+ "grad_norm": 1.892635464668274,
6086
+ "learning_rate": 9.30243766254757e-05,
6087
+ "loss": 4.3884,
6088
+ "num_input_tokens_seen": 666361922,
6089
+ "step": 114000
6090
+ },
6091
+ {
6092
+ "epoch": 1.2579274776983729,
6093
+ "grad_norm": 1.9154144525527954,
6094
+ "learning_rate": 9.29362152313502e-05,
6095
+ "loss": 4.3752,
6096
+ "num_input_tokens_seen": 667241410,
6097
+ "step": 114150
6098
+ },
6099
+ {
6100
+ "epoch": 1.2595804704417348,
6101
+ "grad_norm": 1.9253753423690796,
6102
+ "learning_rate": 9.284805383722468e-05,
6103
+ "loss": 4.3875,
6104
+ "num_input_tokens_seen": 668132226,
6105
+ "step": 114300
6106
+ },
6107
+ {
6108
+ "epoch": 1.2612334631850965,
6109
+ "grad_norm": 1.9465709924697876,
6110
+ "learning_rate": 9.275989244309918e-05,
6111
+ "loss": 4.3742,
6112
+ "num_input_tokens_seen": 669015202,
6113
+ "step": 114450
6114
+ },
6115
+ {
6116
+ "epoch": 1.2628864559284585,
6117
+ "grad_norm": 1.9070016145706177,
6118
+ "learning_rate": 9.267173104897366e-05,
6119
+ "loss": 4.3737,
6120
+ "num_input_tokens_seen": 669892578,
6121
+ "step": 114600
6122
+ },
6123
+ {
6124
+ "epoch": 1.2645394486718202,
6125
+ "grad_norm": 1.9075013399124146,
6126
+ "learning_rate": 9.258356965484816e-05,
6127
+ "loss": 4.3789,
6128
+ "num_input_tokens_seen": 670773314,
6129
+ "step": 114750
6130
+ },
6131
+ {
6132
+ "epoch": 1.2661924414151822,
6133
+ "grad_norm": 1.8648816347122192,
6134
+ "learning_rate": 9.249540826072263e-05,
6135
+ "loss": 4.3583,
6136
+ "num_input_tokens_seen": 671644514,
6137
+ "step": 114900
6138
+ },
6139
+ {
6140
+ "epoch": 1.2678454341585441,
6141
+ "grad_norm": 1.9572055339813232,
6142
+ "learning_rate": 9.240724686659714e-05,
6143
+ "loss": 4.3871,
6144
+ "num_input_tokens_seen": 672523202,
6145
+ "step": 115050
6146
+ },
6147
+ {
6148
+ "epoch": 1.2694984269019058,
6149
+ "grad_norm": 1.9419187307357788,
6150
+ "learning_rate": 9.231908547247161e-05,
6151
+ "loss": 4.3802,
6152
+ "num_input_tokens_seen": 673387298,
6153
+ "step": 115200
6154
+ },
6155
+ {
6156
+ "epoch": 1.2711514196452678,
6157
+ "grad_norm": 1.9556363821029663,
6158
+ "learning_rate": 9.223092407834611e-05,
6159
+ "loss": 4.3922,
6160
+ "num_input_tokens_seen": 674262786,
6161
+ "step": 115350
6162
+ },
6163
+ {
6164
+ "epoch": 1.2728044123886297,
6165
+ "grad_norm": 1.8693435192108154,
6166
+ "learning_rate": 9.214276268422059e-05,
6167
+ "loss": 4.3719,
6168
+ "num_input_tokens_seen": 675145058,
6169
+ "step": 115500
6170
+ },
6171
+ {
6172
+ "epoch": 1.2744574051319915,
6173
+ "grad_norm": 1.9475206136703491,
6174
+ "learning_rate": 9.205460129009508e-05,
6175
+ "loss": 4.38,
6176
+ "num_input_tokens_seen": 676008962,
6177
+ "step": 115650
6178
+ },
6179
+ {
6180
+ "epoch": 1.2761103978753534,
6181
+ "grad_norm": 1.8718332052230835,
6182
+ "learning_rate": 9.196643989596957e-05,
6183
+ "loss": 4.3734,
6184
+ "num_input_tokens_seen": 676887042,
6185
+ "step": 115800
6186
+ },
6187
+ {
6188
+ "epoch": 1.2777633906187151,
6189
+ "grad_norm": 1.8318613767623901,
6190
+ "learning_rate": 9.187827850184405e-05,
6191
+ "loss": 4.3857,
6192
+ "num_input_tokens_seen": 677766690,
6193
+ "step": 115950
6194
  }
6195
  ],
6196
  "logging_steps": 150,
6197
  "max_steps": 272232,
6198
+ "num_input_tokens_seen": 678060130,
6199
  "num_train_epochs": 3,
6200
  "save_steps": 500,
6201
  "stateful_callbacks": {
 
6210
  "attributes": {}
6211
  }
6212
  },
6213
+ "total_flos": 1.04696823656832e+16,
6214
  "train_batch_size": 32,
6215
  "trial_name": null,
6216
  "trial_params": null