{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 951,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010515247108307045,
"grad_norm": 9.591360424317704,
"learning_rate": 2.0833333333333333e-07,
"loss": 1.5585,
"step": 1
},
{
"epoch": 0.005257623554153523,
"grad_norm": 9.792129596772575,
"learning_rate": 1.0416666666666667e-06,
"loss": 1.5717,
"step": 5
},
{
"epoch": 0.010515247108307046,
"grad_norm": 3.5907937606092246,
"learning_rate": 2.0833333333333334e-06,
"loss": 1.54,
"step": 10
},
{
"epoch": 0.015772870662460567,
"grad_norm": 2.305935396467309,
"learning_rate": 3.125e-06,
"loss": 1.5164,
"step": 15
},
{
"epoch": 0.02103049421661409,
"grad_norm": 1.3430988630406788,
"learning_rate": 4.166666666666667e-06,
"loss": 1.4398,
"step": 20
},
{
"epoch": 0.026288117770767613,
"grad_norm": 1.3395602302159622,
"learning_rate": 5.208333333333334e-06,
"loss": 1.4374,
"step": 25
},
{
"epoch": 0.031545741324921134,
"grad_norm": 0.927990494487892,
"learning_rate": 6.25e-06,
"loss": 1.3952,
"step": 30
},
{
"epoch": 0.03680336487907466,
"grad_norm": 0.817869963107523,
"learning_rate": 7.291666666666667e-06,
"loss": 1.3688,
"step": 35
},
{
"epoch": 0.04206098843322818,
"grad_norm": 0.8099795395904227,
"learning_rate": 8.333333333333334e-06,
"loss": 1.3543,
"step": 40
},
{
"epoch": 0.0473186119873817,
"grad_norm": 0.8196989977897423,
"learning_rate": 9.375000000000001e-06,
"loss": 1.4065,
"step": 45
},
{
"epoch": 0.052576235541535225,
"grad_norm": 0.7891645835706297,
"learning_rate": 1.0416666666666668e-05,
"loss": 1.3719,
"step": 50
},
{
"epoch": 0.05783385909568875,
"grad_norm": 0.7730662990445142,
"learning_rate": 1.1458333333333333e-05,
"loss": 1.3534,
"step": 55
},
{
"epoch": 0.06309148264984227,
"grad_norm": 0.8436428208666454,
"learning_rate": 1.25e-05,
"loss": 1.3484,
"step": 60
},
{
"epoch": 0.0683491062039958,
"grad_norm": 0.930633139226441,
"learning_rate": 1.3541666666666668e-05,
"loss": 1.3339,
"step": 65
},
{
"epoch": 0.07360672975814932,
"grad_norm": 1.2035603215763095,
"learning_rate": 1.4583333333333333e-05,
"loss": 1.3577,
"step": 70
},
{
"epoch": 0.07886435331230283,
"grad_norm": 0.9963124148010385,
"learning_rate": 1.5625e-05,
"loss": 1.3504,
"step": 75
},
{
"epoch": 0.08412197686645637,
"grad_norm": 0.7705000485956546,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.3316,
"step": 80
},
{
"epoch": 0.08937960042060988,
"grad_norm": 0.892949375851743,
"learning_rate": 1.7708333333333335e-05,
"loss": 1.3289,
"step": 85
},
{
"epoch": 0.0946372239747634,
"grad_norm": 0.909849658521525,
"learning_rate": 1.8750000000000002e-05,
"loss": 1.3311,
"step": 90
},
{
"epoch": 0.09989484752891693,
"grad_norm": 0.8255721604143701,
"learning_rate": 1.979166666666667e-05,
"loss": 1.3224,
"step": 95
},
{
"epoch": 0.10515247108307045,
"grad_norm": 0.8253137907707107,
"learning_rate": 1.9998919935516768e-05,
"loss": 1.3291,
"step": 100
},
{
"epoch": 0.11041009463722397,
"grad_norm": 0.8322397994562853,
"learning_rate": 1.999453257340926e-05,
"loss": 1.321,
"step": 105
},
{
"epoch": 0.1156677181913775,
"grad_norm": 0.8169700972120059,
"learning_rate": 1.9986771889316172e-05,
"loss": 1.3145,
"step": 110
},
{
"epoch": 0.12092534174553102,
"grad_norm": 0.8054378165910604,
"learning_rate": 1.9975640502598243e-05,
"loss": 1.3537,
"step": 115
},
{
"epoch": 0.12618296529968454,
"grad_norm": 0.8851185792466449,
"learning_rate": 1.9961142170284762e-05,
"loss": 1.3081,
"step": 120
},
{
"epoch": 0.13144058885383805,
"grad_norm": 0.7983608745240434,
"learning_rate": 1.9943281785805483e-05,
"loss": 1.3235,
"step": 125
},
{
"epoch": 0.1366982124079916,
"grad_norm": 0.8101924460502825,
"learning_rate": 1.9922065377339037e-05,
"loss": 1.3234,
"step": 130
},
{
"epoch": 0.14195583596214512,
"grad_norm": 0.8181918475547033,
"learning_rate": 1.98975001057783e-05,
"loss": 1.3275,
"step": 135
},
{
"epoch": 0.14721345951629863,
"grad_norm": 0.8204094295073951,
"learning_rate": 1.986959426231349e-05,
"loss": 1.3191,
"step": 140
},
{
"epoch": 0.15247108307045215,
"grad_norm": 0.8193214439419285,
"learning_rate": 1.983835726563373e-05,
"loss": 1.3151,
"step": 145
},
{
"epoch": 0.15772870662460567,
"grad_norm": 0.8008496299830612,
"learning_rate": 1.9803799658748096e-05,
"loss": 1.3173,
"step": 150
},
{
"epoch": 0.16298633017875921,
"grad_norm": 0.8976369540265376,
"learning_rate": 1.976593310542718e-05,
"loss": 1.3193,
"step": 155
},
{
"epoch": 0.16824395373291273,
"grad_norm": 0.8219320759522418,
"learning_rate": 1.9724770386266363e-05,
"loss": 1.3074,
"step": 160
},
{
"epoch": 0.17350157728706625,
"grad_norm": 0.7917666416050304,
"learning_rate": 1.968032539437215e-05,
"loss": 1.3229,
"step": 165
},
{
"epoch": 0.17875920084121977,
"grad_norm": 0.8468899498707804,
"learning_rate": 1.963261313067302e-05,
"loss": 1.3053,
"step": 170
},
{
"epoch": 0.18401682439537329,
"grad_norm": 0.7826255604907222,
"learning_rate": 1.958164969885636e-05,
"loss": 1.2994,
"step": 175
},
{
"epoch": 0.1892744479495268,
"grad_norm": 0.7854651229191463,
"learning_rate": 1.9527452299933192e-05,
"loss": 1.2933,
"step": 180
},
{
"epoch": 0.19453207150368035,
"grad_norm": 0.8335518596689603,
"learning_rate": 1.9470039226432562e-05,
"loss": 1.3053,
"step": 185
},
{
"epoch": 0.19978969505783387,
"grad_norm": 0.7925970741738505,
"learning_rate": 1.9409429856227487e-05,
"loss": 1.3118,
"step": 190
},
{
"epoch": 0.20504731861198738,
"grad_norm": 0.9022667633817576,
"learning_rate": 1.934564464599461e-05,
"loss": 1.3006,
"step": 195
},
{
"epoch": 0.2103049421661409,
"grad_norm": 0.7846381821366918,
"learning_rate": 1.9278705124309724e-05,
"loss": 1.3019,
"step": 200
},
{
"epoch": 0.21556256572029442,
"grad_norm": 0.8523088106258838,
"learning_rate": 1.9208633884381528e-05,
"loss": 1.3096,
"step": 205
},
{
"epoch": 0.22082018927444794,
"grad_norm": 0.7773410227959319,
"learning_rate": 1.913545457642601e-05,
"loss": 1.3292,
"step": 210
},
{
"epoch": 0.22607781282860148,
"grad_norm": 0.7700372006553495,
"learning_rate": 1.9059191899684154e-05,
"loss": 1.3039,
"step": 215
},
{
"epoch": 0.231335436382755,
"grad_norm": 0.888501634972992,
"learning_rate": 1.8979871594085482e-05,
"loss": 1.2877,
"step": 220
},
{
"epoch": 0.23659305993690852,
"grad_norm": 1.1002462859379447,
"learning_rate": 1.8897520431560435e-05,
"loss": 1.3015,
"step": 225
},
{
"epoch": 0.24185068349106204,
"grad_norm": 0.8737204056440837,
"learning_rate": 1.881216620700437e-05,
"loss": 1.3115,
"step": 230
},
{
"epoch": 0.24710830704521555,
"grad_norm": 0.789475981373434,
"learning_rate": 1.872383772889634e-05,
"loss": 1.3046,
"step": 235
},
{
"epoch": 0.25236593059936907,
"grad_norm": 0.7857837110627846,
"learning_rate": 1.863256480957574e-05,
"loss": 1.314,
"step": 240
},
{
"epoch": 0.2576235541535226,
"grad_norm": 0.8735939515101088,
"learning_rate": 1.853837825518014e-05,
"loss": 1.2965,
"step": 245
},
{
"epoch": 0.2628811777076761,
"grad_norm": 0.8274285162573444,
"learning_rate": 1.844130985524771e-05,
"loss": 1.2847,
"step": 250
},
{
"epoch": 0.26813880126182965,
"grad_norm": 0.9655358070442068,
"learning_rate": 1.83413923719877e-05,
"loss": 1.3033,
"step": 255
},
{
"epoch": 0.2733964248159832,
"grad_norm": 0.784933865330852,
"learning_rate": 1.8238659529222672e-05,
"loss": 1.2964,
"step": 260
},
{
"epoch": 0.2786540483701367,
"grad_norm": 0.8072089928871935,
"learning_rate": 1.813314600100612e-05,
"loss": 1.3,
"step": 265
},
{
"epoch": 0.28391167192429023,
"grad_norm": 0.7600500534210821,
"learning_rate": 1.802488739991941e-05,
"loss": 1.2897,
"step": 270
},
{
"epoch": 0.2891692954784437,
"grad_norm": 0.7716489846603405,
"learning_rate": 1.7913920265051947e-05,
"loss": 1.2994,
"step": 275
},
{
"epoch": 0.29442691903259727,
"grad_norm": 0.8439416659266935,
"learning_rate": 1.7800282049668593e-05,
"loss": 1.3146,
"step": 280
},
{
"epoch": 0.2996845425867508,
"grad_norm": 0.7413366975663128,
"learning_rate": 1.7684011108568593e-05,
"loss": 1.3157,
"step": 285
},
{
"epoch": 0.3049421661409043,
"grad_norm": 0.9324094146103826,
"learning_rate": 1.7565146685140168e-05,
"loss": 1.2944,
"step": 290
},
{
"epoch": 0.31019978969505785,
"grad_norm": 0.7497525070350954,
"learning_rate": 1.7443728898115228e-05,
"loss": 1.3041,
"step": 295
},
{
"epoch": 0.31545741324921134,
"grad_norm": 0.7274267632257158,
"learning_rate": 1.7319798728028617e-05,
"loss": 1.2855,
"step": 300
},
{
"epoch": 0.3207150368033649,
"grad_norm": 0.7306525895599777,
"learning_rate": 1.7193398003386514e-05,
"loss": 1.2967,
"step": 305
},
{
"epoch": 0.32597266035751843,
"grad_norm": 0.7718504682998596,
"learning_rate": 1.7064569386548586e-05,
"loss": 1.3116,
"step": 310
},
{
"epoch": 0.3312302839116719,
"grad_norm": 0.7002164572114049,
"learning_rate": 1.6933356359328756e-05,
"loss": 1.2812,
"step": 315
},
{
"epoch": 0.33648790746582546,
"grad_norm": 0.757112323690967,
"learning_rate": 1.679980320831934e-05,
"loss": 1.2654,
"step": 320
},
{
"epoch": 0.34174553101997895,
"grad_norm": 0.7509591301953,
"learning_rate": 1.6663955009943603e-05,
"loss": 1.2755,
"step": 325
},
{
"epoch": 0.3470031545741325,
"grad_norm": 0.7896423206563724,
"learning_rate": 1.6525857615241686e-05,
"loss": 1.291,
"step": 330
},
{
"epoch": 0.352260778128286,
"grad_norm": 0.8538357892614057,
"learning_rate": 1.6385557634395138e-05,
"loss": 1.3,
"step": 335
},
{
"epoch": 0.35751840168243953,
"grad_norm": 0.8257603179264333,
"learning_rate": 1.624310242099518e-05,
"loss": 1.2825,
"step": 340
},
{
"epoch": 0.3627760252365931,
"grad_norm": 0.7506565139625698,
"learning_rate": 1.609854005606009e-05,
"loss": 1.2903,
"step": 345
},
{
"epoch": 0.36803364879074657,
"grad_norm": 0.7962631813094616,
"learning_rate": 1.5951919331807052e-05,
"loss": 1.32,
"step": 350
},
{
"epoch": 0.3732912723449001,
"grad_norm": 0.7846004527594455,
"learning_rate": 1.5803289735183952e-05,
"loss": 1.3094,
"step": 355
},
{
"epoch": 0.3785488958990536,
"grad_norm": 0.7529193912570731,
"learning_rate": 1.565270143116672e-05,
"loss": 1.3097,
"step": 360
},
{
"epoch": 0.38380651945320715,
"grad_norm": 0.8450472817355228,
"learning_rate": 1.5500205245827814e-05,
"loss": 1.2954,
"step": 365
},
{
"epoch": 0.3890641430073607,
"grad_norm": 0.7884988874909659,
"learning_rate": 1.5345852649181555e-05,
"loss": 1.2774,
"step": 370
},
{
"epoch": 0.3943217665615142,
"grad_norm": 0.7181038933510216,
"learning_rate": 1.5189695737812153e-05,
"loss": 1.2788,
"step": 375
},
{
"epoch": 0.39957939011566773,
"grad_norm": 0.7398759340722635,
"learning_rate": 1.503178721729022e-05,
"loss": 1.2825,
"step": 380
},
{
"epoch": 0.4048370136698212,
"grad_norm": 0.6946338532175111,
"learning_rate": 1.4872180384383772e-05,
"loss": 1.2945,
"step": 385
},
{
"epoch": 0.41009463722397477,
"grad_norm": 0.7312230396618198,
"learning_rate": 1.4710929109069674e-05,
"loss": 1.2774,
"step": 390
},
{
"epoch": 0.4153522607781283,
"grad_norm": 0.685269793573933,
"learning_rate": 1.4548087816351616e-05,
"loss": 1.2691,
"step": 395
},
{
"epoch": 0.4206098843322818,
"grad_norm": 0.7367025121344211,
"learning_rate": 1.4383711467890776e-05,
"loss": 1.3029,
"step": 400
},
{
"epoch": 0.42586750788643535,
"grad_norm": 0.954670307639884,
"learning_rate": 1.4217855543455323e-05,
"loss": 1.2846,
"step": 405
},
{
"epoch": 0.43112513144058884,
"grad_norm": 0.7884436143443709,
"learning_rate": 1.4050576022195084e-05,
"loss": 1.2686,
"step": 410
},
{
"epoch": 0.4363827549947424,
"grad_norm": 0.6923648258189327,
"learning_rate": 1.3881929363747628e-05,
"loss": 1.2717,
"step": 415
},
{
"epoch": 0.4416403785488959,
"grad_norm": 0.7131098366130528,
"learning_rate": 1.3711972489182208e-05,
"loss": 1.2968,
"step": 420
},
{
"epoch": 0.4468980021030494,
"grad_norm": 0.7320310034272568,
"learning_rate": 1.3540762761787938e-05,
"loss": 1.2829,
"step": 425
},
{
"epoch": 0.45215562565720296,
"grad_norm": 0.7327893476336677,
"learning_rate": 1.3368357967712726e-05,
"loss": 1.2877,
"step": 430
},
{
"epoch": 0.45741324921135645,
"grad_norm": 0.6970014096675695,
"learning_rate": 1.3194816296459483e-05,
"loss": 1.2871,
"step": 435
},
{
"epoch": 0.46267087276551,
"grad_norm": 0.7014388522833548,
"learning_rate": 1.302019632124619e-05,
"loss": 1.2897,
"step": 440
},
{
"epoch": 0.4679284963196635,
"grad_norm": 0.6970153763179125,
"learning_rate": 1.2844556979236463e-05,
"loss": 1.2714,
"step": 445
},
{
"epoch": 0.47318611987381703,
"grad_norm": 0.7162117080997548,
"learning_rate": 1.2667957551647263e-05,
"loss": 1.2705,
"step": 450
},
{
"epoch": 0.4784437434279706,
"grad_norm": 0.7402201632812351,
"learning_rate": 1.24904576437405e-05,
"loss": 1.2654,
"step": 455
},
{
"epoch": 0.48370136698212407,
"grad_norm": 1.0567871681807486,
"learning_rate": 1.2312117164705267e-05,
"loss": 1.2784,
"step": 460
},
{
"epoch": 0.4889589905362776,
"grad_norm": 0.7383322032141622,
"learning_rate": 1.213299630743747e-05,
"loss": 1.2574,
"step": 465
},
{
"epoch": 0.4942166140904311,
"grad_norm": 0.7402615331947396,
"learning_rate": 1.1953155528223728e-05,
"loss": 1.2861,
"step": 470
},
{
"epoch": 0.49947423764458465,
"grad_norm": 0.7073342447220505,
"learning_rate": 1.1772655526336367e-05,
"loss": 1.2899,
"step": 475
},
{
"epoch": 0.5047318611987381,
"grad_norm": 0.6966270957094003,
"learning_rate": 1.1591557223546394e-05,
"loss": 1.2607,
"step": 480
},
{
"epoch": 0.5099894847528917,
"grad_norm": 0.6995026613750213,
"learning_rate": 1.1409921743561383e-05,
"loss": 1.285,
"step": 485
},
{
"epoch": 0.5152471083070452,
"grad_norm": 0.677915660036085,
"learning_rate": 1.1227810391395199e-05,
"loss": 1.2787,
"step": 490
},
{
"epoch": 0.5205047318611987,
"grad_norm": 0.694058172582696,
"learning_rate": 1.1045284632676535e-05,
"loss": 1.2817,
"step": 495
},
{
"epoch": 0.5257623554153522,
"grad_norm": 0.6977571213673606,
"learning_rate": 1.0862406072903224e-05,
"loss": 1.269,
"step": 500
},
{
"epoch": 0.5310199789695058,
"grad_norm": 0.709669977534319,
"learning_rate": 1.067923643664936e-05,
"loss": 1.2569,
"step": 505
},
{
"epoch": 0.5362776025236593,
"grad_norm": 0.700607260206098,
"learning_rate": 1.0495837546732224e-05,
"loss": 1.2722,
"step": 510
},
{
"epoch": 0.5415352260778128,
"grad_norm": 0.6965062537462676,
"learning_rate": 1.031227130334604e-05,
"loss": 1.2689,
"step": 515
},
{
"epoch": 0.5467928496319664,
"grad_norm": 0.6975551930599063,
"learning_rate": 1.0128599663169629e-05,
"loss": 1.3171,
"step": 520
},
{
"epoch": 0.5520504731861199,
"grad_norm": 0.6835553883087109,
"learning_rate": 9.944884618454996e-06,
"loss": 1.2616,
"step": 525
},
{
"epoch": 0.5573080967402734,
"grad_norm": 0.6932270302779978,
"learning_rate": 9.761188176103902e-06,
"loss": 1.2842,
"step": 530
},
{
"epoch": 0.562565720294427,
"grad_norm": 0.7523295705383632,
"learning_rate": 9.577572336739491e-06,
"loss": 1.276,
"step": 535
},
{
"epoch": 0.5678233438485805,
"grad_norm": 0.7410406370549468,
"learning_rate": 9.394099073780066e-06,
"loss": 1.2451,
"step": 540
},
{
"epoch": 0.573080967402734,
"grad_norm": 0.6626519082062838,
"learning_rate": 9.210830312521991e-06,
"loss": 1.2505,
"step": 545
},
{
"epoch": 0.5783385909568874,
"grad_norm": 0.678178932637677,
"learning_rate": 9.027827909238902e-06,
"loss": 1.2884,
"step": 550
},
{
"epoch": 0.583596214511041,
"grad_norm": 0.6964235915302415,
"learning_rate": 8.84515363030414e-06,
"loss": 1.2674,
"step": 555
},
{
"epoch": 0.5888538380651945,
"grad_norm": 0.7295327929832226,
"learning_rate": 8.662869131343607e-06,
"loss": 1.2606,
"step": 560
},
{
"epoch": 0.594111461619348,
"grad_norm": 0.6671644394075068,
"learning_rate": 8.481035936425928e-06,
"loss": 1.2631,
"step": 565
},
{
"epoch": 0.5993690851735016,
"grad_norm": 0.6918129382316475,
"learning_rate": 8.299715417297072e-06,
"loss": 1.2733,
"step": 570
},
{
"epoch": 0.6046267087276551,
"grad_norm": 0.7648575778497364,
"learning_rate": 8.118968772666338e-06,
"loss": 1.2768,
"step": 575
},
{
"epoch": 0.6098843322818086,
"grad_norm": 0.6660976751002976,
"learning_rate": 7.938857007550797e-06,
"loss": 1.2712,
"step": 580
},
{
"epoch": 0.6151419558359621,
"grad_norm": 0.8206761231157318,
"learning_rate": 7.759440912685043e-06,
"loss": 1.2629,
"step": 585
},
{
"epoch": 0.6203995793901157,
"grad_norm": 0.6781893460495839,
"learning_rate": 7.580781044003324e-06,
"loss": 1.2928,
"step": 590
},
{
"epoch": 0.6256572029442692,
"grad_norm": 0.7043738811990014,
"learning_rate": 7.402937702200905e-06,
"loss": 1.2565,
"step": 595
},
{
"epoch": 0.6309148264984227,
"grad_norm": 0.6676378108729303,
"learning_rate": 7.225970912381557e-06,
"loss": 1.2441,
"step": 600
},
{
"epoch": 0.6361724500525763,
"grad_norm": 0.7086319165657909,
"learning_rate": 7.04994040379809e-06,
"loss": 1.2526,
"step": 605
},
{
"epoch": 0.6414300736067298,
"grad_norm": 0.6584513029407596,
"learning_rate": 6.874905589692734e-06,
"loss": 1.2689,
"step": 610
},
{
"epoch": 0.6466876971608833,
"grad_norm": 0.6556498134167102,
"learning_rate": 6.700925547244173e-06,
"loss": 1.254,
"step": 615
},
{
"epoch": 0.6519453207150369,
"grad_norm": 0.7317497625816267,
"learning_rate": 6.528058997627995e-06,
"loss": 1.2773,
"step": 620
},
{
"epoch": 0.6572029442691903,
"grad_norm": 0.6943495622984553,
"learning_rate": 6.356364286197341e-06,
"loss": 1.2774,
"step": 625
},
{
"epoch": 0.6624605678233438,
"grad_norm": 0.6557703327216338,
"learning_rate": 6.18589936279034e-06,
"loss": 1.2561,
"step": 630
},
{
"epoch": 0.6677181913774973,
"grad_norm": 0.7479106302240468,
"learning_rate": 6.016721762171098e-06,
"loss": 1.2592,
"step": 635
},
{
"epoch": 0.6729758149316509,
"grad_norm": 0.6501670860792287,
"learning_rate": 5.848888584610727e-06,
"loss": 1.2647,
"step": 640
},
{
"epoch": 0.6782334384858044,
"grad_norm": 0.6750656043866252,
"learning_rate": 5.6824564766150724e-06,
"loss": 1.2687,
"step": 645
},
{
"epoch": 0.6834910620399579,
"grad_norm": 0.707162362118668,
"learning_rate": 5.51748161180554e-06,
"loss": 1.2658,
"step": 650
},
{
"epoch": 0.6887486855941115,
"grad_norm": 0.638182422688773,
"learning_rate": 5.354019671959601e-06,
"loss": 1.2618,
"step": 655
},
{
"epoch": 0.694006309148265,
"grad_norm": 0.6799258857804185,
"learning_rate": 5.192125828217203e-06,
"loss": 1.265,
"step": 660
},
{
"epoch": 0.6992639327024185,
"grad_norm": 0.6614260729055945,
"learning_rate": 5.0318547224596525e-06,
"loss": 1.2726,
"step": 665
},
{
"epoch": 0.704521556256572,
"grad_norm": 0.6571261971890797,
"learning_rate": 4.873260448867004e-06,
"loss": 1.2635,
"step": 670
},
{
"epoch": 0.7097791798107256,
"grad_norm": 0.6434154288185444,
"learning_rate": 4.716396535660412e-06,
"loss": 1.2571,
"step": 675
},
{
"epoch": 0.7150368033648791,
"grad_norm": 0.6507843588939136,
"learning_rate": 4.5613159270354455e-06,
"loss": 1.2768,
"step": 680
},
{
"epoch": 0.7202944269190326,
"grad_norm": 0.6888038425645421,
"learning_rate": 4.408070965292534e-06,
"loss": 1.2729,
"step": 685
},
{
"epoch": 0.7255520504731862,
"grad_norm": 0.6548677910438372,
"learning_rate": 4.256713373170565e-06,
"loss": 1.2754,
"step": 690
},
{
"epoch": 0.7308096740273397,
"grad_norm": 0.6924489060189015,
"learning_rate": 4.107294236389603e-06,
"loss": 1.2428,
"step": 695
},
{
"epoch": 0.7360672975814931,
"grad_norm": 0.675766971656018,
"learning_rate": 3.959863986408593e-06,
"loss": 1.2422,
"step": 700
},
{
"epoch": 0.7413249211356467,
"grad_norm": 0.6759780214552317,
"learning_rate": 3.8144723834039076e-06,
"loss": 1.2596,
"step": 705
},
{
"epoch": 0.7465825446898002,
"grad_norm": 0.6577848370025432,
"learning_rate": 3.671168499474449e-06,
"loss": 1.2653,
"step": 710
},
{
"epoch": 0.7518401682439537,
"grad_norm": 0.6816662338694881,
"learning_rate": 3.5300007020789997e-06,
"loss": 1.2612,
"step": 715
},
{
"epoch": 0.7570977917981072,
"grad_norm": 0.6412644189429695,
"learning_rate": 3.3910166377113894e-06,
"loss": 1.2606,
"step": 720
},
{
"epoch": 0.7623554153522608,
"grad_norm": 0.6489342949048617,
"learning_rate": 3.2542632158190135e-06,
"loss": 1.2499,
"step": 725
},
{
"epoch": 0.7676130389064143,
"grad_norm": 0.6305858451193405,
"learning_rate": 3.119786592970102e-06,
"loss": 1.2368,
"step": 730
},
{
"epoch": 0.7728706624605678,
"grad_norm": 0.6601632070842128,
"learning_rate": 2.9876321572751143e-06,
"loss": 1.2826,
"step": 735
},
{
"epoch": 0.7781282860147214,
"grad_norm": 0.6512679140886882,
"learning_rate": 2.8578445130674835e-06,
"loss": 1.2509,
"step": 740
},
{
"epoch": 0.7833859095688749,
"grad_norm": 0.662446849135675,
"learning_rate": 2.7304674658489104e-06,
"loss": 1.2593,
"step": 745
},
{
"epoch": 0.7886435331230284,
"grad_norm": 0.6503725043711072,
"learning_rate": 2.6055440075042793e-06,
"loss": 1.2696,
"step": 750
},
{
"epoch": 0.7939011566771819,
"grad_norm": 0.6349030168998883,
"learning_rate": 2.4831163017911687e-06,
"loss": 1.2458,
"step": 755
},
{
"epoch": 0.7991587802313355,
"grad_norm": 0.6296499888770365,
"learning_rate": 2.3632256701088817e-06,
"loss": 1.2581,
"step": 760
},
{
"epoch": 0.804416403785489,
"grad_norm": 0.6477912073667947,
"learning_rate": 2.2459125775517854e-06,
"loss": 1.2614,
"step": 765
},
{
"epoch": 0.8096740273396424,
"grad_norm": 0.6340147585135972,
"learning_rate": 2.1312166192516593e-06,
"loss": 1.2707,
"step": 770
},
{
"epoch": 0.814931650893796,
"grad_norm": 0.6535117628009195,
"learning_rate": 2.019176507013677e-06,
"loss": 1.2586,
"step": 775
},
{
"epoch": 0.8201892744479495,
"grad_norm": 0.6467399510691909,
"learning_rate": 1.9098300562505266e-06,
"loss": 1.2548,
"step": 780
},
{
"epoch": 0.825446898002103,
"grad_norm": 0.6237075846859721,
"learning_rate": 1.8032141732190722e-06,
"loss": 1.26,
"step": 785
},
{
"epoch": 0.8307045215562566,
"grad_norm": 0.6374095389974708,
"learning_rate": 1.6993648425638797e-06,
"loss": 1.2605,
"step": 790
},
{
"epoch": 0.8359621451104101,
"grad_norm": 0.6326077447290254,
"learning_rate": 1.5983171151717924e-06,
"loss": 1.2701,
"step": 795
},
{
"epoch": 0.8412197686645636,
"grad_norm": 0.6299244980807773,
"learning_rate": 1.5001050963416718e-06,
"loss": 1.2427,
"step": 800
},
{
"epoch": 0.8464773922187171,
"grad_norm": 0.6281934013360307,
"learning_rate": 1.404761934273291e-06,
"loss": 1.2635,
"step": 805
},
{
"epoch": 0.8517350157728707,
"grad_norm": 0.6374294536071755,
"learning_rate": 1.3123198088792577e-06,
"loss": 1.2717,
"step": 810
},
{
"epoch": 0.8569926393270242,
"grad_norm": 0.6298699771941692,
"learning_rate": 1.222809920923761e-06,
"loss": 1.2499,
"step": 815
},
{
"epoch": 0.8622502628811777,
"grad_norm": 0.6359419744147026,
"learning_rate": 1.1362624814917843e-06,
"loss": 1.2552,
"step": 820
},
{
"epoch": 0.8675078864353313,
"grad_norm": 0.6285813274381151,
"learning_rate": 1.0527067017923654e-06,
"loss": 1.2737,
"step": 825
},
{
"epoch": 0.8727655099894848,
"grad_norm": 0.6210713562517961,
"learning_rate": 9.721707832993232e-07,
"loss": 1.2663,
"step": 830
},
{
"epoch": 0.8780231335436383,
"grad_norm": 0.6645416418213763,
"learning_rate": 8.946819082327829e-07,
"loss": 1.2581,
"step": 835
},
{
"epoch": 0.8832807570977917,
"grad_norm": 0.6251751866944497,
"learning_rate": 8.202662303847298e-07,
"loss": 1.2519,
"step": 840
},
{
"epoch": 0.8885383806519453,
"grad_norm": 0.6431790207111582,
"learning_rate": 7.48948866291661e-07,
"loss": 1.2614,
"step": 845
},
{
"epoch": 0.8937960042060988,
"grad_norm": 0.6384407054635708,
"learning_rate": 6.80753886757336e-07,
"loss": 1.2722,
"step": 850
},
{
"epoch": 0.8990536277602523,
"grad_norm": 0.631430226459749,
"learning_rate": 6.157043087284797e-07,
"loss": 1.2587,
"step": 855
},
{
"epoch": 0.9043112513144059,
"grad_norm": 0.631843611779411,
"learning_rate": 5.538220875261736e-07,
"loss": 1.2481,
"step": 860
},
{
"epoch": 0.9095688748685594,
"grad_norm": 0.6250494334708813,
"learning_rate": 4.951281094355708e-07,
"loss": 1.2552,
"step": 865
},
{
"epoch": 0.9148264984227129,
"grad_norm": 0.6300532606909163,
"learning_rate": 4.396421846564236e-07,
"loss": 1.2536,
"step": 870
},
{
"epoch": 0.9200841219768665,
"grad_norm": 0.6200012582073006,
"learning_rate": 3.8738304061681107e-07,
"loss": 1.2694,
"step": 875
},
{
"epoch": 0.92534174553102,
"grad_norm": 0.6324403821172889,
"learning_rate": 3.3836831565231877e-07,
"loss": 1.2545,
"step": 880
},
{
"epoch": 0.9305993690851735,
"grad_norm": 0.6325686488791483,
"learning_rate": 2.926145530528002e-07,
"loss": 1.2531,
"step": 885
},
{
"epoch": 0.935856992639327,
"grad_norm": 0.6373731914514896,
"learning_rate": 2.501371954787479e-07,
"loss": 1.2648,
"step": 890
},
{
"epoch": 0.9411146161934806,
"grad_norm": 0.6255593953506046,
"learning_rate": 2.109505797491318e-07,
"loss": 1.2392,
"step": 895
},
{
"epoch": 0.9463722397476341,
"grad_norm": 0.6353907042408413,
"learning_rate": 1.7506793200248507e-07,
"loss": 1.2633,
"step": 900
},
{
"epoch": 0.9516298633017876,
"grad_norm": 0.6307207707434458,
"learning_rate": 1.4250136323285868e-07,
"loss": 1.2603,
"step": 905
},
{
"epoch": 0.9568874868559412,
"grad_norm": 0.6277705175750871,
"learning_rate": 1.1326186520215888e-07,
"loss": 1.2453,
"step": 910
},
{
"epoch": 0.9621451104100947,
"grad_norm": 0.6031775390358434,
"learning_rate": 8.735930673024806e-08,
"loss": 1.2544,
"step": 915
},
{
"epoch": 0.9674027339642481,
"grad_norm": 0.6189714576376613,
"learning_rate": 6.480243036404598e-08,
"loss": 1.2661,
"step": 920
},
{
"epoch": 0.9726603575184016,
"grad_norm": 0.6219922382476483,
"learning_rate": 4.5598849426777833e-08,
"loss": 1.2418,
"step": 925
},
{
"epoch": 0.9779179810725552,
"grad_norm": 0.6188912441406357,
"learning_rate": 2.9755045448351948e-08,
"loss": 1.2471,
"step": 930
},
{
"epoch": 0.9831756046267087,
"grad_norm": 0.6241442557644146,
"learning_rate": 1.7276365977730858e-08,
"loss": 1.2591,
"step": 935
},
{
"epoch": 0.9884332281808622,
"grad_norm": 0.6148942917091785,
"learning_rate": 8.167022778045042e-09,
"loss": 1.2783,
"step": 940
},
{
"epoch": 0.9936908517350158,
"grad_norm": 0.6142845224019251,
"learning_rate": 2.430090405054486e-09,
"loss": 1.2311,
"step": 945
},
{
"epoch": 0.9989484752891693,
"grad_norm": 0.6318778333420784,
"learning_rate": 6.750516943321295e-11,
"loss": 1.2488,
"step": 950
},
{
"epoch": 1.0,
"eval_loss": 1.266036868095398,
"eval_runtime": 32.6481,
"eval_samples_per_second": 412.213,
"eval_steps_per_second": 6.463,
"step": 951
},
{
"epoch": 1.0,
"step": 951,
"total_flos": 101091718987776.0,
"train_loss": 1.289618753484371,
"train_runtime": 1098.6048,
"train_samples_per_second": 110.718,
"train_steps_per_second": 0.866
}
],
"logging_steps": 5,
"max_steps": 951,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 101091718987776.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}