{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1521,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01972386587771203,
"grad_norm": 8.260985845744274,
"learning_rate": 6.493506493506493e-07,
"loss": 0.7556,
"step": 10
},
{
"epoch": 0.03944773175542406,
"grad_norm": 3.285262290141762,
"learning_rate": 1.2987012987012986e-06,
"loss": 0.6243,
"step": 20
},
{
"epoch": 0.05917159763313609,
"grad_norm": 1.8240152669785012,
"learning_rate": 1.9480519480519483e-06,
"loss": 0.527,
"step": 30
},
{
"epoch": 0.07889546351084813,
"grad_norm": 2.484918347587673,
"learning_rate": 2.597402597402597e-06,
"loss": 0.4858,
"step": 40
},
{
"epoch": 0.09861932938856016,
"grad_norm": 1.6071106659008887,
"learning_rate": 3.246753246753247e-06,
"loss": 0.4616,
"step": 50
},
{
"epoch": 0.11834319526627218,
"grad_norm": 1.7753595146102916,
"learning_rate": 3.896103896103897e-06,
"loss": 0.4467,
"step": 60
},
{
"epoch": 0.13806706114398423,
"grad_norm": 1.9952784096044014,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.4355,
"step": 70
},
{
"epoch": 0.15779092702169625,
"grad_norm": 2.0090088525329057,
"learning_rate": 4.999952075361122e-06,
"loss": 0.4303,
"step": 80
},
{
"epoch": 0.17751479289940827,
"grad_norm": 1.9429866993343683,
"learning_rate": 4.99910013857428e-06,
"loss": 0.4213,
"step": 90
},
{
"epoch": 0.19723865877712032,
"grad_norm": 2.2150406308730166,
"learning_rate": 4.997183673954895e-06,
"loss": 0.4205,
"step": 100
},
{
"epoch": 0.21696252465483234,
"grad_norm": 2.3280715715799105,
"learning_rate": 4.994203588590157e-06,
"loss": 0.4132,
"step": 110
},
{
"epoch": 0.23668639053254437,
"grad_norm": 2.0514718162160617,
"learning_rate": 4.9901612929925455e-06,
"loss": 0.4097,
"step": 120
},
{
"epoch": 0.2564102564102564,
"grad_norm": 2.248051724393392,
"learning_rate": 4.985058700432217e-06,
"loss": 0.4078,
"step": 130
},
{
"epoch": 0.27613412228796846,
"grad_norm": 2.4477065193392114,
"learning_rate": 4.978898226031426e-06,
"loss": 0.4035,
"step": 140
},
{
"epoch": 0.2958579881656805,
"grad_norm": 2.3530821376592317,
"learning_rate": 4.97168278562142e-06,
"loss": 0.3988,
"step": 150
},
{
"epoch": 0.3155818540433925,
"grad_norm": 2.0658208779463796,
"learning_rate": 4.9634157943623345e-06,
"loss": 0.4008,
"step": 160
},
{
"epoch": 0.33530571992110453,
"grad_norm": 1.6308701318103827,
"learning_rate": 4.954101165126764e-06,
"loss": 0.3955,
"step": 170
},
{
"epoch": 0.35502958579881655,
"grad_norm": 1.8767575875235638,
"learning_rate": 4.943743306647738e-06,
"loss": 0.3964,
"step": 180
},
{
"epoch": 0.3747534516765286,
"grad_norm": 2.158851334024998,
"learning_rate": 4.932347121432018e-06,
"loss": 0.3955,
"step": 190
},
{
"epoch": 0.39447731755424065,
"grad_norm": 2.2424601067528367,
"learning_rate": 4.919918003439677e-06,
"loss": 0.3929,
"step": 200
},
{
"epoch": 0.41420118343195267,
"grad_norm": 1.4704562127782181,
"learning_rate": 4.9064618355310694e-06,
"loss": 0.3951,
"step": 210
},
{
"epoch": 0.4339250493096647,
"grad_norm": 1.5325962055467024,
"learning_rate": 4.8919849866823955e-06,
"loss": 0.3936,
"step": 220
},
{
"epoch": 0.4536489151873767,
"grad_norm": 1.752553432251344,
"learning_rate": 4.8764943089711876e-06,
"loss": 0.3894,
"step": 230
},
{
"epoch": 0.47337278106508873,
"grad_norm": 3.582185649197669,
"learning_rate": 4.859997134333133e-06,
"loss": 0.39,
"step": 240
},
{
"epoch": 0.4930966469428008,
"grad_norm": 2.283623608488685,
"learning_rate": 4.842501271091773e-06,
"loss": 0.3845,
"step": 250
},
{
"epoch": 0.5128205128205128,
"grad_norm": 2.954635543178996,
"learning_rate": 4.8240150002627285e-06,
"loss": 0.3853,
"step": 260
},
{
"epoch": 0.5325443786982249,
"grad_norm": 2.621411991175976,
"learning_rate": 4.80454707163418e-06,
"loss": 0.3802,
"step": 270
},
{
"epoch": 0.5522682445759369,
"grad_norm": 3.0076538937186554,
"learning_rate": 4.784106699625493e-06,
"loss": 0.3778,
"step": 280
},
{
"epoch": 0.571992110453649,
"grad_norm": 2.620788244299813,
"learning_rate": 4.762703558925907e-06,
"loss": 0.381,
"step": 290
},
{
"epoch": 0.591715976331361,
"grad_norm": 2.600774288511616,
"learning_rate": 4.740347779915384e-06,
"loss": 0.3795,
"step": 300
},
{
"epoch": 0.611439842209073,
"grad_norm": 2.825934593188172,
"learning_rate": 4.717049943869774e-06,
"loss": 0.3754,
"step": 310
},
{
"epoch": 0.631163708086785,
"grad_norm": 1.9636455738063043,
"learning_rate": 4.692821077952556e-06,
"loss": 0.3709,
"step": 320
},
{
"epoch": 0.650887573964497,
"grad_norm": 1.465934093555826,
"learning_rate": 4.667672649995539e-06,
"loss": 0.3686,
"step": 330
},
{
"epoch": 0.6706114398422091,
"grad_norm": 1.6730733158146738,
"learning_rate": 4.641616563071003e-06,
"loss": 0.374,
"step": 340
},
{
"epoch": 0.6903353057199211,
"grad_norm": 1.6420981338152472,
"learning_rate": 4.6146651498578095e-06,
"loss": 0.3725,
"step": 350
},
{
"epoch": 0.7100591715976331,
"grad_norm": 1.7081311753490396,
"learning_rate": 4.586831166804191e-06,
"loss": 0.3723,
"step": 360
},
{
"epoch": 0.7297830374753451,
"grad_norm": 1.7004420091082517,
"learning_rate": 4.558127788089966e-06,
"loss": 0.3685,
"step": 370
},
{
"epoch": 0.7495069033530573,
"grad_norm": 1.813129623101683,
"learning_rate": 4.5285685993910246e-06,
"loss": 0.3693,
"step": 380
},
{
"epoch": 0.7692307692307693,
"grad_norm": 1.441392302489358,
"learning_rate": 4.49816759144906e-06,
"loss": 0.3672,
"step": 390
},
{
"epoch": 0.7889546351084813,
"grad_norm": 1.743528342139816,
"learning_rate": 4.466939153449565e-06,
"loss": 0.3629,
"step": 400
},
{
"epoch": 0.8086785009861933,
"grad_norm": 1.5505480061250534,
"learning_rate": 4.434898066211255e-06,
"loss": 0.3647,
"step": 410
},
{
"epoch": 0.8284023668639053,
"grad_norm": 1.748134152515452,
"learning_rate": 4.402059495190112e-06,
"loss": 0.3687,
"step": 420
},
{
"epoch": 0.8481262327416174,
"grad_norm": 1.888131474531523,
"learning_rate": 4.368438983301382e-06,
"loss": 0.368,
"step": 430
},
{
"epoch": 0.8678500986193294,
"grad_norm": 1.3077877777100417,
"learning_rate": 4.334052443562914e-06,
"loss": 0.364,
"step": 440
},
{
"epoch": 0.8875739644970414,
"grad_norm": 1.7143497390643974,
"learning_rate": 4.298916151563324e-06,
"loss": 0.3662,
"step": 450
},
{
"epoch": 0.9072978303747534,
"grad_norm": 1.2650560376490414,
"learning_rate": 4.263046737758557e-06,
"loss": 0.3634,
"step": 460
},
{
"epoch": 0.9270216962524654,
"grad_norm": 1.325272234023546,
"learning_rate": 4.226461179600474e-06,
"loss": 0.3647,
"step": 470
},
{
"epoch": 0.9467455621301775,
"grad_norm": 1.7799396783443953,
"learning_rate": 4.189176793501208e-06,
"loss": 0.3601,
"step": 480
},
{
"epoch": 0.9664694280078896,
"grad_norm": 1.6138030010077298,
"learning_rate": 4.151211226637083e-06,
"loss": 0.3639,
"step": 490
},
{
"epoch": 0.9861932938856016,
"grad_norm": 1.6475058606657829,
"learning_rate": 4.112582448595989e-06,
"loss": 0.3631,
"step": 500
},
{
"epoch": 1.0,
"eval_loss": 0.3610161542892456,
"eval_runtime": 46.5378,
"eval_samples_per_second": 293.095,
"eval_steps_per_second": 1.16,
"step": 507
},
{
"epoch": 1.0059171597633136,
"grad_norm": 2.318083617694004,
"learning_rate": 4.073308742872136e-06,
"loss": 0.339,
"step": 510
},
{
"epoch": 1.0256410256410255,
"grad_norm": 2.26507527796031,
"learning_rate": 4.033408698212244e-06,
"loss": 0.2904,
"step": 520
},
{
"epoch": 1.0453648915187377,
"grad_norm": 2.129210352759771,
"learning_rate": 3.99290119981726e-06,
"loss": 0.2845,
"step": 530
},
{
"epoch": 1.0650887573964498,
"grad_norm": 2.0458511034566897,
"learning_rate": 3.95180542040374e-06,
"loss": 0.2826,
"step": 540
},
{
"epoch": 1.0848126232741617,
"grad_norm": 2.34540520465628,
"learning_rate": 3.910140811129166e-06,
"loss": 0.2817,
"step": 550
},
{
"epoch": 1.1045364891518739,
"grad_norm": 1.5731137478504271,
"learning_rate": 3.8679270923854596e-06,
"loss": 0.2816,
"step": 560
},
{
"epoch": 1.1242603550295858,
"grad_norm": 1.9641564243584235,
"learning_rate": 3.825184244465071e-06,
"loss": 0.2833,
"step": 570
},
{
"epoch": 1.143984220907298,
"grad_norm": 1.5653763677552233,
"learning_rate": 3.7819324981040517e-06,
"loss": 0.2835,
"step": 580
},
{
"epoch": 1.1637080867850098,
"grad_norm": 1.4455902546137582,
"learning_rate": 3.7381923249065838e-06,
"loss": 0.2806,
"step": 590
},
{
"epoch": 1.183431952662722,
"grad_norm": 1.4589441051909717,
"learning_rate": 3.6939844276555146e-06,
"loss": 0.2842,
"step": 600
},
{
"epoch": 1.2031558185404339,
"grad_norm": 1.4737079619190827,
"learning_rate": 3.649329730513461e-06,
"loss": 0.2818,
"step": 610
},
{
"epoch": 1.222879684418146,
"grad_norm": 1.424470321783783,
"learning_rate": 3.6042493691191377e-06,
"loss": 0.2835,
"step": 620
},
{
"epoch": 1.242603550295858,
"grad_norm": 1.43822809638539,
"learning_rate": 3.558764680583589e-06,
"loss": 0.2829,
"step": 630
},
{
"epoch": 1.26232741617357,
"grad_norm": 1.4491877471048427,
"learning_rate": 3.51289719339106e-06,
"loss": 0.2823,
"step": 640
},
{
"epoch": 1.282051282051282,
"grad_norm": 1.4979353903583295,
"learning_rate": 3.4666686172092927e-06,
"loss": 0.2859,
"step": 650
},
{
"epoch": 1.301775147928994,
"grad_norm": 1.4793881592613725,
"learning_rate": 3.4201008326140596e-06,
"loss": 0.2849,
"step": 660
},
{
"epoch": 1.3214990138067062,
"grad_norm": 1.6343693105840815,
"learning_rate": 3.3732158807328116e-06,
"loss": 0.2875,
"step": 670
},
{
"epoch": 1.3412228796844181,
"grad_norm": 1.5638318327999918,
"learning_rate": 3.3260359528123266e-06,
"loss": 0.2877,
"step": 680
},
{
"epoch": 1.3609467455621302,
"grad_norm": 1.434550639059279,
"learning_rate": 3.2785833797153115e-06,
"loss": 0.2817,
"step": 690
},
{
"epoch": 1.3806706114398422,
"grad_norm": 1.3783604211664602,
"learning_rate": 3.2308806213509204e-06,
"loss": 0.2809,
"step": 700
},
{
"epoch": 1.4003944773175543,
"grad_norm": 1.7104337243982326,
"learning_rate": 3.182950256044188e-06,
"loss": 0.2825,
"step": 710
},
{
"epoch": 1.4201183431952662,
"grad_norm": 1.9527331404429782,
"learning_rate": 3.1348149698494233e-06,
"loss": 0.2827,
"step": 720
},
{
"epoch": 1.4398422090729783,
"grad_norm": 1.5082040480125063,
"learning_rate": 3.0864975458126158e-06,
"loss": 0.2857,
"step": 730
},
{
"epoch": 1.4595660749506902,
"grad_norm": 1.5939434329404958,
"learning_rate": 3.038020853187914e-06,
"loss": 0.2831,
"step": 740
},
{
"epoch": 1.4792899408284024,
"grad_norm": 1.425454732201556,
"learning_rate": 2.98940783661333e-06,
"loss": 0.2802,
"step": 750
},
{
"epoch": 1.4990138067061145,
"grad_norm": 1.4324944544127631,
"learning_rate": 2.940681505250742e-06,
"loss": 0.2848,
"step": 760
},
{
"epoch": 1.5187376725838264,
"grad_norm": 1.4082984304420074,
"learning_rate": 2.8918649218953624e-06,
"loss": 0.2801,
"step": 770
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.5895657718154816,
"learning_rate": 2.84298119205983e-06,
"loss": 0.2807,
"step": 780
},
{
"epoch": 1.5581854043392505,
"grad_norm": 1.6080440377232041,
"learning_rate": 2.7940534530380666e-06,
"loss": 0.2835,
"step": 790
},
{
"epoch": 1.5779092702169626,
"grad_norm": 1.404915797241871,
"learning_rate": 2.7451048629541045e-06,
"loss": 0.2808,
"step": 800
},
{
"epoch": 1.5976331360946747,
"grad_norm": 1.4879672080505235,
"learning_rate": 2.6961585898010523e-06,
"loss": 0.2806,
"step": 810
},
{
"epoch": 1.6173570019723866,
"grad_norm": 1.3888602093522253,
"learning_rate": 2.647237800475384e-06,
"loss": 0.2832,
"step": 820
},
{
"epoch": 1.6370808678500985,
"grad_norm": 1.3670120148082392,
"learning_rate": 2.5983656498117525e-06,
"loss": 0.2825,
"step": 830
},
{
"epoch": 1.6568047337278107,
"grad_norm": 1.2812642080517738,
"learning_rate": 2.54956526962351e-06,
"loss": 0.279,
"step": 840
},
{
"epoch": 1.6765285996055228,
"grad_norm": 1.252430854449729,
"learning_rate": 2.5008597577541288e-06,
"loss": 0.2814,
"step": 850
},
{
"epoch": 1.6962524654832347,
"grad_norm": 1.2750427994477165,
"learning_rate": 2.45227216714469e-06,
"loss": 0.2792,
"step": 860
},
{
"epoch": 1.7159763313609466,
"grad_norm": 1.354377403404739,
"learning_rate": 2.403825494922636e-06,
"loss": 0.282,
"step": 870
},
{
"epoch": 1.7357001972386588,
"grad_norm": 1.4267990848182481,
"learning_rate": 2.3555426715169396e-06,
"loss": 0.2791,
"step": 880
},
{
"epoch": 1.755424063116371,
"grad_norm": 1.252857555239978,
"learning_rate": 2.3074465498048303e-06,
"loss": 0.2826,
"step": 890
},
{
"epoch": 1.7751479289940828,
"grad_norm": 1.2876786054611615,
"learning_rate": 2.259559894295244e-06,
"loss": 0.2789,
"step": 900
},
{
"epoch": 1.7948717948717947,
"grad_norm": 1.2629901820145135,
"learning_rate": 2.2119053703540866e-06,
"loss": 0.2791,
"step": 910
},
{
"epoch": 1.8145956607495068,
"grad_norm": 1.3562733049556417,
"learning_rate": 2.1645055334764237e-06,
"loss": 0.2807,
"step": 920
},
{
"epoch": 1.834319526627219,
"grad_norm": 1.3132542320273741,
"learning_rate": 2.1173828186106828e-06,
"loss": 0.2782,
"step": 930
},
{
"epoch": 1.854043392504931,
"grad_norm": 1.372645351488049,
"learning_rate": 2.0705595295399e-06,
"loss": 0.28,
"step": 940
},
{
"epoch": 1.873767258382643,
"grad_norm": 1.286506818666612,
"learning_rate": 2.0240578283250596e-06,
"loss": 0.2788,
"step": 950
},
{
"epoch": 1.893491124260355,
"grad_norm": 1.343985774681719,
"learning_rate": 1.9778997248155013e-06,
"loss": 0.2779,
"step": 960
},
{
"epoch": 1.913214990138067,
"grad_norm": 1.3873943864064089,
"learning_rate": 1.9321070662313824e-06,
"loss": 0.2768,
"step": 970
},
{
"epoch": 1.9329388560157792,
"grad_norm": 1.3822544572854645,
"learning_rate": 1.88670152682311e-06,
"loss": 0.2753,
"step": 980
},
{
"epoch": 1.952662721893491,
"grad_norm": 1.3724554338840655,
"learning_rate": 1.8417045976126347e-06,
"loss": 0.274,
"step": 990
},
{
"epoch": 1.972386587771203,
"grad_norm": 1.428387339598408,
"learning_rate": 1.797137576221482e-06,
"loss": 0.2775,
"step": 1000
},
{
"epoch": 1.9921104536489151,
"grad_norm": 1.2370547509299645,
"learning_rate": 1.753021556790314e-06,
"loss": 0.2746,
"step": 1010
},
{
"epoch": 2.0,
"eval_loss": 0.3482723832130432,
"eval_runtime": 46.4255,
"eval_samples_per_second": 293.804,
"eval_steps_per_second": 1.163,
"step": 1014
},
{
"epoch": 2.0118343195266273,
"grad_norm": 1.9502351693684774,
"learning_rate": 1.7093774199948004e-06,
"loss": 0.2309,
"step": 1020
},
{
"epoch": 2.0315581854043394,
"grad_norm": 1.5862323859503984,
"learning_rate": 1.6662258231625331e-06,
"loss": 0.2026,
"step": 1030
},
{
"epoch": 2.051282051282051,
"grad_norm": 1.3292614459089434,
"learning_rate": 1.6235871904956431e-06,
"loss": 0.2034,
"step": 1040
},
{
"epoch": 2.0710059171597632,
"grad_norm": 1.2370582334736997,
"learning_rate": 1.5814817034037715e-06,
"loss": 0.2008,
"step": 1050
},
{
"epoch": 2.0907297830374754,
"grad_norm": 1.325897622024457,
"learning_rate": 1.5399292909519422e-06,
"loss": 0.2042,
"step": 1060
},
{
"epoch": 2.1104536489151875,
"grad_norm": 1.4548395791353137,
"learning_rate": 1.4989496204278897e-06,
"loss": 0.2025,
"step": 1070
},
{
"epoch": 2.1301775147928996,
"grad_norm": 1.36179677292465,
"learning_rate": 1.458562088033273e-06,
"loss": 0.1978,
"step": 1080
},
{
"epoch": 2.1499013806706113,
"grad_norm": 1.4589926591648759,
"learning_rate": 1.4187858097032086e-06,
"loss": 0.2024,
"step": 1090
},
{
"epoch": 2.1696252465483234,
"grad_norm": 1.3095440667780154,
"learning_rate": 1.3796396120584576e-06,
"loss": 0.2032,
"step": 1100
},
{
"epoch": 2.1893491124260356,
"grad_norm": 1.3522834520399176,
"learning_rate": 1.341142023494537e-06,
"loss": 0.1992,
"step": 1110
},
{
"epoch": 2.2090729783037477,
"grad_norm": 1.3914925068585928,
"learning_rate": 1.3033112654120032e-06,
"loss": 0.2029,
"step": 1120
},
{
"epoch": 2.2287968441814594,
"grad_norm": 1.2392072409116117,
"learning_rate": 1.266165243592024e-06,
"loss": 0.2019,
"step": 1130
},
{
"epoch": 2.2485207100591715,
"grad_norm": 1.450828785906611,
"learning_rate": 1.2297215397213442e-06,
"loss": 0.2029,
"step": 1140
},
{
"epoch": 2.2682445759368837,
"grad_norm": 1.3539897715774756,
"learning_rate": 1.1939974030706499e-06,
"loss": 0.1989,
"step": 1150
},
{
"epoch": 2.287968441814596,
"grad_norm": 1.3124427663284721,
"learning_rate": 1.1590097423302681e-06,
"loss": 0.2013,
"step": 1160
},
{
"epoch": 2.3076923076923075,
"grad_norm": 1.2751387286158546,
"learning_rate": 1.1247751176070688e-06,
"loss": 0.2003,
"step": 1170
},
{
"epoch": 2.3274161735700196,
"grad_norm": 1.2826788452929796,
"learning_rate": 1.0913097325863526e-06,
"loss": 0.2013,
"step": 1180
},
{
"epoch": 2.3471400394477318,
"grad_norm": 1.3449233167779666,
"learning_rate": 1.0586294268624391e-06,
"loss": 0.2031,
"step": 1190
},
{
"epoch": 2.366863905325444,
"grad_norm": 1.3034368496811286,
"learning_rate": 1.026749668441587e-06,
"loss": 0.1994,
"step": 1200
},
{
"epoch": 2.386587771203156,
"grad_norm": 1.3565807097213252,
"learning_rate": 9.956855464207873e-07,
"loss": 0.2,
"step": 1210
},
{
"epoch": 2.4063116370808677,
"grad_norm": 1.451004027193357,
"learning_rate": 9.654517638459015e-07,
"loss": 0.1996,
"step": 1220
},
{
"epoch": 2.42603550295858,
"grad_norm": 1.3107553476519733,
"learning_rate": 9.360626307525231e-07,
"loss": 0.2004,
"step": 1230
},
{
"epoch": 2.445759368836292,
"grad_norm": 1.2866100592193557,
"learning_rate": 9.075320573928513e-07,
"loss": 0.2026,
"step": 1240
},
{
"epoch": 2.465483234714004,
"grad_norm": 1.3169876215045113,
"learning_rate": 8.798735476517964e-07,
"loss": 0.2027,
"step": 1250
},
{
"epoch": 2.485207100591716,
"grad_norm": 1.2821201625196061,
"learning_rate": 8.531001926554134e-07,
"loss": 0.2011,
"step": 1260
},
{
"epoch": 2.504930966469428,
"grad_norm": 1.315132765819279,
"learning_rate": 8.272246645747072e-07,
"loss": 0.199,
"step": 1270
},
{
"epoch": 2.52465483234714,
"grad_norm": 1.276154658164099,
"learning_rate": 8.022592106277332e-07,
"loss": 0.2008,
"step": 1280
},
{
"epoch": 2.544378698224852,
"grad_norm": 1.2274421062761773,
"learning_rate": 7.782156472828299e-07,
"loss": 0.1998,
"step": 1290
},
{
"epoch": 2.564102564102564,
"grad_norm": 1.2435720383981574,
"learning_rate": 7.551053546657356e-07,
"loss": 0.1995,
"step": 1300
},
{
"epoch": 2.583826429980276,
"grad_norm": 1.2327909078947592,
"learning_rate": 7.329392711732278e-07,
"loss": 0.2024,
"step": 1310
},
{
"epoch": 2.603550295857988,
"grad_norm": 1.1783489485507048,
"learning_rate": 7.117278882958421e-07,
"loss": 0.2003,
"step": 1320
},
{
"epoch": 2.6232741617357003,
"grad_norm": 1.2687230261577986,
"learning_rate": 6.914812456521138e-07,
"loss": 0.2006,
"step": 1330
},
{
"epoch": 2.6429980276134124,
"grad_norm": 1.2646158919927277,
"learning_rate": 6.722089262366993e-07,
"loss": 0.1982,
"step": 1340
},
{
"epoch": 2.662721893491124,
"grad_norm": 1.2236131305338422,
"learning_rate": 6.539200518846226e-07,
"loss": 0.2001,
"step": 1350
},
{
"epoch": 2.6824457593688362,
"grad_norm": 1.2428023457207789,
"learning_rate": 6.366232789537923e-07,
"loss": 0.2048,
"step": 1360
},
{
"epoch": 2.7021696252465484,
"grad_norm": 1.2559417256017682,
"learning_rate": 6.203267942278395e-07,
"loss": 0.2012,
"step": 1370
},
{
"epoch": 2.7218934911242605,
"grad_norm": 1.2572564112264348,
"learning_rate": 6.050383110412069e-07,
"loss": 0.1994,
"step": 1380
},
{
"epoch": 2.7416173570019726,
"grad_norm": 1.1764889460619852,
"learning_rate": 5.907650656283289e-07,
"loss": 0.2002,
"step": 1390
},
{
"epoch": 2.7613412228796843,
"grad_norm": 1.2804661059833917,
"learning_rate": 5.775138136986298e-07,
"loss": 0.2002,
"step": 1400
},
{
"epoch": 2.7810650887573964,
"grad_norm": 1.3077263435732718,
"learning_rate": 5.652908272389604e-07,
"loss": 0.1995,
"step": 1410
},
{
"epoch": 2.8007889546351086,
"grad_norm": 1.231137370296971,
"learning_rate": 5.541018915449863e-07,
"loss": 0.1989,
"step": 1420
},
{
"epoch": 2.8205128205128203,
"grad_norm": 1.3443797697665705,
"learning_rate": 5.439523024829335e-07,
"loss": 0.1983,
"step": 1430
},
{
"epoch": 2.8402366863905324,
"grad_norm": 1.2092638219767884,
"learning_rate": 5.348468639829871e-07,
"loss": 0.2007,
"step": 1440
},
{
"epoch": 2.8599605522682445,
"grad_norm": 1.2392545674361426,
"learning_rate": 5.267898857655307e-07,
"loss": 0.201,
"step": 1450
},
{
"epoch": 2.8796844181459567,
"grad_norm": 1.255507262390408,
"learning_rate": 5.19785181301299e-07,
"loss": 0.2008,
"step": 1460
},
{
"epoch": 2.899408284023669,
"grad_norm": 1.2545629120536586,
"learning_rate": 5.138360660064146e-07,
"loss": 0.1979,
"step": 1470
},
{
"epoch": 2.9191321499013805,
"grad_norm": 1.2279624795193589,
"learning_rate": 5.08945355673159e-07,
"loss": 0.201,
"step": 1480
},
{
"epoch": 2.9388560157790926,
"grad_norm": 1.2395946923655343,
"learning_rate": 5.05115365137222e-07,
"loss": 0.1999,
"step": 1490
},
{
"epoch": 2.9585798816568047,
"grad_norm": 1.2212433583596156,
"learning_rate": 5.023479071820607e-07,
"loss": 0.1989,
"step": 1500
},
{
"epoch": 2.978303747534517,
"grad_norm": 1.298954785136158,
"learning_rate": 5.006442916808849e-07,
"loss": 0.2019,
"step": 1510
},
{
"epoch": 2.998027613412229,
"grad_norm": 1.3586461216494594,
"learning_rate": 5.000053249766787e-07,
"loss": 0.1999,
"step": 1520
},
{
"epoch": 3.0,
"eval_loss": 0.37075862288475037,
"eval_runtime": 53.9042,
"eval_samples_per_second": 253.041,
"eval_steps_per_second": 1.002,
"step": 1521
},
{
"epoch": 3.0,
"step": 1521,
"total_flos": 2547731650314240.0,
"train_loss": 0.2954053143131192,
"train_runtime": 9131.3778,
"train_samples_per_second": 85.143,
"train_steps_per_second": 0.167
}
],
"logging_steps": 10,
"max_steps": 1521,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2547731650314240.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}