{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 951,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0010515247108307045,
      "grad_norm": 9.591360424317704,
      "learning_rate": 2.0833333333333333e-07,
      "loss": 1.5585,
      "step": 1
    },
    {
      "epoch": 0.005257623554153523,
      "grad_norm": 9.792129596772575,
      "learning_rate": 1.0416666666666667e-06,
      "loss": 1.5717,
      "step": 5
    },
    {
      "epoch": 0.010515247108307046,
      "grad_norm": 3.5907937606092246,
      "learning_rate": 2.0833333333333334e-06,
      "loss": 1.54,
      "step": 10
    },
    {
      "epoch": 0.015772870662460567,
      "grad_norm": 2.305935396467309,
      "learning_rate": 3.125e-06,
      "loss": 1.5164,
      "step": 15
    },
    {
      "epoch": 0.02103049421661409,
      "grad_norm": 1.3430988630406788,
      "learning_rate": 4.166666666666667e-06,
      "loss": 1.4398,
      "step": 20
    },
    {
      "epoch": 0.026288117770767613,
      "grad_norm": 1.3395602302159622,
      "learning_rate": 5.208333333333334e-06,
      "loss": 1.4374,
      "step": 25
    },
    {
      "epoch": 0.031545741324921134,
      "grad_norm": 0.927990494487892,
      "learning_rate": 6.25e-06,
      "loss": 1.3952,
      "step": 30
    },
    {
      "epoch": 0.03680336487907466,
      "grad_norm": 0.817869963107523,
      "learning_rate": 7.291666666666667e-06,
      "loss": 1.3688,
      "step": 35
    },
    {
      "epoch": 0.04206098843322818,
      "grad_norm": 0.8099795395904227,
      "learning_rate": 8.333333333333334e-06,
      "loss": 1.3543,
      "step": 40
    },
    {
      "epoch": 0.0473186119873817,
      "grad_norm": 0.8196989977897423,
      "learning_rate": 9.375000000000001e-06,
      "loss": 1.4065,
      "step": 45
    },
    {
      "epoch": 0.052576235541535225,
      "grad_norm": 0.7891645835706297,
      "learning_rate": 1.0416666666666668e-05,
      "loss": 1.3719,
      "step": 50
    },
    {
      "epoch": 0.05783385909568875,
      "grad_norm": 0.7730662990445142,
      "learning_rate": 1.1458333333333333e-05,
      "loss": 1.3534,
      "step": 55
    },
    {
      "epoch": 0.06309148264984227,
      "grad_norm": 0.8436428208666454,
      "learning_rate": 1.25e-05,
      "loss": 1.3484,
      "step": 60
    },
    {
      "epoch": 0.0683491062039958,
      "grad_norm": 0.930633139226441,
      "learning_rate": 1.3541666666666668e-05,
      "loss": 1.3339,
      "step": 65
    },
    {
      "epoch": 0.07360672975814932,
      "grad_norm": 1.2035603215763095,
      "learning_rate": 1.4583333333333333e-05,
      "loss": 1.3577,
      "step": 70
    },
    {
      "epoch": 0.07886435331230283,
      "grad_norm": 0.9963124148010385,
      "learning_rate": 1.5625e-05,
      "loss": 1.3504,
      "step": 75
    },
    {
      "epoch": 0.08412197686645637,
      "grad_norm": 0.7705000485956546,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.3316,
      "step": 80
    },
    {
      "epoch": 0.08937960042060988,
      "grad_norm": 0.892949375851743,
      "learning_rate": 1.7708333333333335e-05,
      "loss": 1.3289,
      "step": 85
    },
    {
      "epoch": 0.0946372239747634,
      "grad_norm": 0.909849658521525,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 1.3311,
      "step": 90
    },
    {
      "epoch": 0.09989484752891693,
      "grad_norm": 0.8255721604143701,
      "learning_rate": 1.979166666666667e-05,
      "loss": 1.3224,
      "step": 95
    },
    {
      "epoch": 0.10515247108307045,
      "grad_norm": 0.8253137907707107,
      "learning_rate": 1.9998919935516768e-05,
      "loss": 1.3291,
      "step": 100
    },
    {
      "epoch": 0.11041009463722397,
      "grad_norm": 0.8322397994562853,
      "learning_rate": 1.999453257340926e-05,
      "loss": 1.321,
      "step": 105
    },
    {
      "epoch": 0.1156677181913775,
      "grad_norm": 0.8169700972120059,
      "learning_rate": 1.9986771889316172e-05,
      "loss": 1.3145,
      "step": 110
    },
    {
      "epoch": 0.12092534174553102,
      "grad_norm": 0.8054378165910604,
      "learning_rate": 1.9975640502598243e-05,
      "loss": 1.3537,
      "step": 115
    },
    {
      "epoch": 0.12618296529968454,
      "grad_norm": 0.8851185792466449,
      "learning_rate": 1.9961142170284762e-05,
      "loss": 1.3081,
      "step": 120
    },
    {
      "epoch": 0.13144058885383805,
      "grad_norm": 0.7983608745240434,
      "learning_rate": 1.9943281785805483e-05,
      "loss": 1.3235,
      "step": 125
    },
    {
      "epoch": 0.1366982124079916,
      "grad_norm": 0.8101924460502825,
      "learning_rate": 1.9922065377339037e-05,
      "loss": 1.3234,
      "step": 130
    },
    {
      "epoch": 0.14195583596214512,
      "grad_norm": 0.8181918475547033,
      "learning_rate": 1.98975001057783e-05,
      "loss": 1.3275,
      "step": 135
    },
    {
      "epoch": 0.14721345951629863,
      "grad_norm": 0.8204094295073951,
      "learning_rate": 1.986959426231349e-05,
      "loss": 1.3191,
      "step": 140
    },
    {
      "epoch": 0.15247108307045215,
      "grad_norm": 0.8193214439419285,
      "learning_rate": 1.983835726563373e-05,
      "loss": 1.3151,
      "step": 145
    },
    {
      "epoch": 0.15772870662460567,
      "grad_norm": 0.8008496299830612,
      "learning_rate": 1.9803799658748096e-05,
      "loss": 1.3173,
      "step": 150
    },
    {
      "epoch": 0.16298633017875921,
      "grad_norm": 0.8976369540265376,
      "learning_rate": 1.976593310542718e-05,
      "loss": 1.3193,
      "step": 155
    },
    {
      "epoch": 0.16824395373291273,
      "grad_norm": 0.8219320759522418,
      "learning_rate": 1.9724770386266363e-05,
      "loss": 1.3074,
      "step": 160
    },
    {
      "epoch": 0.17350157728706625,
      "grad_norm": 0.7917666416050304,
      "learning_rate": 1.968032539437215e-05,
      "loss": 1.3229,
      "step": 165
    },
    {
      "epoch": 0.17875920084121977,
      "grad_norm": 0.8468899498707804,
      "learning_rate": 1.963261313067302e-05,
      "loss": 1.3053,
      "step": 170
    },
    {
      "epoch": 0.18401682439537329,
      "grad_norm": 0.7826255604907222,
      "learning_rate": 1.958164969885636e-05,
      "loss": 1.2994,
      "step": 175
    },
    {
      "epoch": 0.1892744479495268,
      "grad_norm": 0.7854651229191463,
      "learning_rate": 1.9527452299933192e-05,
      "loss": 1.2933,
      "step": 180
    },
    {
      "epoch": 0.19453207150368035,
      "grad_norm": 0.8335518596689603,
      "learning_rate": 1.9470039226432562e-05,
      "loss": 1.3053,
      "step": 185
    },
    {
      "epoch": 0.19978969505783387,
      "grad_norm": 0.7925970741738505,
      "learning_rate": 1.9409429856227487e-05,
      "loss": 1.3118,
      "step": 190
    },
    {
      "epoch": 0.20504731861198738,
      "grad_norm": 0.9022667633817576,
      "learning_rate": 1.934564464599461e-05,
      "loss": 1.3006,
      "step": 195
    },
    {
      "epoch": 0.2103049421661409,
      "grad_norm": 0.7846381821366918,
      "learning_rate": 1.9278705124309724e-05,
      "loss": 1.3019,
      "step": 200
    },
    {
      "epoch": 0.21556256572029442,
      "grad_norm": 0.8523088106258838,
      "learning_rate": 1.9208633884381528e-05,
      "loss": 1.3096,
      "step": 205
    },
    {
      "epoch": 0.22082018927444794,
      "grad_norm": 0.7773410227959319,
      "learning_rate": 1.913545457642601e-05,
      "loss": 1.3292,
      "step": 210
    },
    {
      "epoch": 0.22607781282860148,
      "grad_norm": 0.7700372006553495,
      "learning_rate": 1.9059191899684154e-05,
      "loss": 1.3039,
      "step": 215
    },
    {
      "epoch": 0.231335436382755,
      "grad_norm": 0.888501634972992,
      "learning_rate": 1.8979871594085482e-05,
      "loss": 1.2877,
      "step": 220
    },
    {
      "epoch": 0.23659305993690852,
      "grad_norm": 1.1002462859379447,
      "learning_rate": 1.8897520431560435e-05,
      "loss": 1.3015,
      "step": 225
    },
    {
      "epoch": 0.24185068349106204,
      "grad_norm": 0.8737204056440837,
      "learning_rate": 1.881216620700437e-05,
      "loss": 1.3115,
      "step": 230
    },
    {
      "epoch": 0.24710830704521555,
      "grad_norm": 0.789475981373434,
      "learning_rate": 1.872383772889634e-05,
      "loss": 1.3046,
      "step": 235
    },
    {
      "epoch": 0.25236593059936907,
      "grad_norm": 0.7857837110627846,
      "learning_rate": 1.863256480957574e-05,
      "loss": 1.314,
      "step": 240
    },
    {
      "epoch": 0.2576235541535226,
      "grad_norm": 0.8735939515101088,
      "learning_rate": 1.853837825518014e-05,
      "loss": 1.2965,
      "step": 245
    },
    {
      "epoch": 0.2628811777076761,
      "grad_norm": 0.8274285162573444,
      "learning_rate": 1.844130985524771e-05,
      "loss": 1.2847,
      "step": 250
    },
    {
      "epoch": 0.26813880126182965,
      "grad_norm": 0.9655358070442068,
      "learning_rate": 1.83413923719877e-05,
      "loss": 1.3033,
      "step": 255
    },
    {
      "epoch": 0.2733964248159832,
      "grad_norm": 0.784933865330852,
      "learning_rate": 1.8238659529222672e-05,
      "loss": 1.2964,
      "step": 260
    },
    {
      "epoch": 0.2786540483701367,
      "grad_norm": 0.8072089928871935,
      "learning_rate": 1.813314600100612e-05,
      "loss": 1.3,
      "step": 265
    },
    {
      "epoch": 0.28391167192429023,
      "grad_norm": 0.7600500534210821,
      "learning_rate": 1.802488739991941e-05,
      "loss": 1.2897,
      "step": 270
    },
    {
      "epoch": 0.2891692954784437,
      "grad_norm": 0.7716489846603405,
      "learning_rate": 1.7913920265051947e-05,
      "loss": 1.2994,
      "step": 275
    },
    {
      "epoch": 0.29442691903259727,
      "grad_norm": 0.8439416659266935,
      "learning_rate": 1.7800282049668593e-05,
      "loss": 1.3146,
      "step": 280
    },
    {
      "epoch": 0.2996845425867508,
      "grad_norm": 0.7413366975663128,
      "learning_rate": 1.7684011108568593e-05,
      "loss": 1.3157,
      "step": 285
    },
    {
      "epoch": 0.3049421661409043,
      "grad_norm": 0.9324094146103826,
      "learning_rate": 1.7565146685140168e-05,
      "loss": 1.2944,
      "step": 290
    },
    {
      "epoch": 0.31019978969505785,
      "grad_norm": 0.7497525070350954,
      "learning_rate": 1.7443728898115228e-05,
      "loss": 1.3041,
      "step": 295
    },
    {
      "epoch": 0.31545741324921134,
      "grad_norm": 0.7274267632257158,
      "learning_rate": 1.7319798728028617e-05,
      "loss": 1.2855,
      "step": 300
    },
    {
      "epoch": 0.3207150368033649,
      "grad_norm": 0.7306525895599777,
      "learning_rate": 1.7193398003386514e-05,
      "loss": 1.2967,
      "step": 305
    },
    {
      "epoch": 0.32597266035751843,
      "grad_norm": 0.7718504682998596,
      "learning_rate": 1.7064569386548586e-05,
      "loss": 1.3116,
      "step": 310
    },
    {
      "epoch": 0.3312302839116719,
      "grad_norm": 0.7002164572114049,
      "learning_rate": 1.6933356359328756e-05,
      "loss": 1.2812,
      "step": 315
    },
    {
      "epoch": 0.33648790746582546,
      "grad_norm": 0.757112323690967,
      "learning_rate": 1.679980320831934e-05,
      "loss": 1.2654,
      "step": 320
    },
    {
      "epoch": 0.34174553101997895,
      "grad_norm": 0.7509591301953,
      "learning_rate": 1.6663955009943603e-05,
      "loss": 1.2755,
      "step": 325
    },
    {
      "epoch": 0.3470031545741325,
      "grad_norm": 0.7896423206563724,
      "learning_rate": 1.6525857615241686e-05,
      "loss": 1.291,
      "step": 330
    },
    {
      "epoch": 0.352260778128286,
      "grad_norm": 0.8538357892614057,
      "learning_rate": 1.6385557634395138e-05,
      "loss": 1.3,
      "step": 335
    },
    {
      "epoch": 0.35751840168243953,
      "grad_norm": 0.8257603179264333,
      "learning_rate": 1.624310242099518e-05,
      "loss": 1.2825,
      "step": 340
    },
    {
      "epoch": 0.3627760252365931,
      "grad_norm": 0.7506565139625698,
      "learning_rate": 1.609854005606009e-05,
      "loss": 1.2903,
      "step": 345
    },
    {
      "epoch": 0.36803364879074657,
      "grad_norm": 0.7962631813094616,
      "learning_rate": 1.5951919331807052e-05,
      "loss": 1.32,
      "step": 350
    },
    {
      "epoch": 0.3732912723449001,
      "grad_norm": 0.7846004527594455,
      "learning_rate": 1.5803289735183952e-05,
      "loss": 1.3094,
      "step": 355
    },
    {
      "epoch": 0.3785488958990536,
      "grad_norm": 0.7529193912570731,
      "learning_rate": 1.565270143116672e-05,
      "loss": 1.3097,
      "step": 360
    },
    {
      "epoch": 0.38380651945320715,
      "grad_norm": 0.8450472817355228,
      "learning_rate": 1.5500205245827814e-05,
      "loss": 1.2954,
      "step": 365
    },
    {
      "epoch": 0.3890641430073607,
      "grad_norm": 0.7884988874909659,
      "learning_rate": 1.5345852649181555e-05,
      "loss": 1.2774,
      "step": 370
    },
    {
      "epoch": 0.3943217665615142,
      "grad_norm": 0.7181038933510216,
      "learning_rate": 1.5189695737812153e-05,
      "loss": 1.2788,
      "step": 375
    },
    {
      "epoch": 0.39957939011566773,
      "grad_norm": 0.7398759340722635,
      "learning_rate": 1.503178721729022e-05,
      "loss": 1.2825,
      "step": 380
    },
    {
      "epoch": 0.4048370136698212,
      "grad_norm": 0.6946338532175111,
      "learning_rate": 1.4872180384383772e-05,
      "loss": 1.2945,
      "step": 385
    },
    {
      "epoch": 0.41009463722397477,
      "grad_norm": 0.7312230396618198,
      "learning_rate": 1.4710929109069674e-05,
      "loss": 1.2774,
      "step": 390
    },
    {
      "epoch": 0.4153522607781283,
      "grad_norm": 0.685269793573933,
      "learning_rate": 1.4548087816351616e-05,
      "loss": 1.2691,
      "step": 395
    },
    {
      "epoch": 0.4206098843322818,
      "grad_norm": 0.7367025121344211,
      "learning_rate": 1.4383711467890776e-05,
      "loss": 1.3029,
      "step": 400
    },
    {
      "epoch": 0.42586750788643535,
      "grad_norm": 0.954670307639884,
      "learning_rate": 1.4217855543455323e-05,
      "loss": 1.2846,
      "step": 405
    },
    {
      "epoch": 0.43112513144058884,
      "grad_norm": 0.7884436143443709,
      "learning_rate": 1.4050576022195084e-05,
      "loss": 1.2686,
      "step": 410
    },
    {
      "epoch": 0.4363827549947424,
      "grad_norm": 0.6923648258189327,
      "learning_rate": 1.3881929363747628e-05,
      "loss": 1.2717,
      "step": 415
    },
    {
      "epoch": 0.4416403785488959,
      "grad_norm": 0.7131098366130528,
      "learning_rate": 1.3711972489182208e-05,
      "loss": 1.2968,
      "step": 420
    },
    {
      "epoch": 0.4468980021030494,
      "grad_norm": 0.7320310034272568,
      "learning_rate": 1.3540762761787938e-05,
      "loss": 1.2829,
      "step": 425
    },
    {
      "epoch": 0.45215562565720296,
      "grad_norm": 0.7327893476336677,
      "learning_rate": 1.3368357967712726e-05,
      "loss": 1.2877,
      "step": 430
    },
    {
      "epoch": 0.45741324921135645,
      "grad_norm": 0.6970014096675695,
      "learning_rate": 1.3194816296459483e-05,
      "loss": 1.2871,
      "step": 435
    },
    {
      "epoch": 0.46267087276551,
      "grad_norm": 0.7014388522833548,
      "learning_rate": 1.302019632124619e-05,
      "loss": 1.2897,
      "step": 440
    },
    {
      "epoch": 0.4679284963196635,
      "grad_norm": 0.6970153763179125,
      "learning_rate": 1.2844556979236463e-05,
      "loss": 1.2714,
      "step": 445
    },
    {
      "epoch": 0.47318611987381703,
      "grad_norm": 0.7162117080997548,
      "learning_rate": 1.2667957551647263e-05,
      "loss": 1.2705,
      "step": 450
    },
    {
      "epoch": 0.4784437434279706,
      "grad_norm": 0.7402201632812351,
      "learning_rate": 1.24904576437405e-05,
      "loss": 1.2654,
      "step": 455
    },
    {
      "epoch": 0.48370136698212407,
      "grad_norm": 1.0567871681807486,
      "learning_rate": 1.2312117164705267e-05,
      "loss": 1.2784,
      "step": 460
    },
    {
      "epoch": 0.4889589905362776,
      "grad_norm": 0.7383322032141622,
      "learning_rate": 1.213299630743747e-05,
      "loss": 1.2574,
      "step": 465
    },
    {
      "epoch": 0.4942166140904311,
      "grad_norm": 0.7402615331947396,
      "learning_rate": 1.1953155528223728e-05,
      "loss": 1.2861,
      "step": 470
    },
    {
      "epoch": 0.49947423764458465,
      "grad_norm": 0.7073342447220505,
      "learning_rate": 1.1772655526336367e-05,
      "loss": 1.2899,
      "step": 475
    },
    {
      "epoch": 0.5047318611987381,
      "grad_norm": 0.6966270957094003,
      "learning_rate": 1.1591557223546394e-05,
      "loss": 1.2607,
      "step": 480
    },
    {
      "epoch": 0.5099894847528917,
      "grad_norm": 0.6995026613750213,
      "learning_rate": 1.1409921743561383e-05,
      "loss": 1.285,
      "step": 485
    },
    {
      "epoch": 0.5152471083070452,
      "grad_norm": 0.677915660036085,
      "learning_rate": 1.1227810391395199e-05,
      "loss": 1.2787,
      "step": 490
    },
    {
      "epoch": 0.5205047318611987,
      "grad_norm": 0.694058172582696,
      "learning_rate": 1.1045284632676535e-05,
      "loss": 1.2817,
      "step": 495
    },
    {
      "epoch": 0.5257623554153522,
      "grad_norm": 0.6977571213673606,
      "learning_rate": 1.0862406072903224e-05,
      "loss": 1.269,
      "step": 500
    },
    {
      "epoch": 0.5310199789695058,
      "grad_norm": 0.709669977534319,
      "learning_rate": 1.067923643664936e-05,
      "loss": 1.2569,
      "step": 505
    },
    {
      "epoch": 0.5362776025236593,
      "grad_norm": 0.700607260206098,
      "learning_rate": 1.0495837546732224e-05,
      "loss": 1.2722,
      "step": 510
    },
    {
      "epoch": 0.5415352260778128,
      "grad_norm": 0.6965062537462676,
      "learning_rate": 1.031227130334604e-05,
      "loss": 1.2689,
      "step": 515
    },
    {
      "epoch": 0.5467928496319664,
      "grad_norm": 0.6975551930599063,
      "learning_rate": 1.0128599663169629e-05,
      "loss": 1.3171,
      "step": 520
    },
    {
      "epoch": 0.5520504731861199,
      "grad_norm": 0.6835553883087109,
      "learning_rate": 9.944884618454996e-06,
      "loss": 1.2616,
      "step": 525
    },
    {
      "epoch": 0.5573080967402734,
      "grad_norm": 0.6932270302779978,
      "learning_rate": 9.761188176103902e-06,
      "loss": 1.2842,
      "step": 530
    },
    {
      "epoch": 0.562565720294427,
      "grad_norm": 0.7523295705383632,
      "learning_rate": 9.577572336739491e-06,
      "loss": 1.276,
      "step": 535
    },
    {
      "epoch": 0.5678233438485805,
      "grad_norm": 0.7410406370549468,
      "learning_rate": 9.394099073780066e-06,
      "loss": 1.2451,
      "step": 540
    },
    {
      "epoch": 0.573080967402734,
      "grad_norm": 0.6626519082062838,
      "learning_rate": 9.210830312521991e-06,
      "loss": 1.2505,
      "step": 545
    },
    {
      "epoch": 0.5783385909568874,
      "grad_norm": 0.678178932637677,
      "learning_rate": 9.027827909238902e-06,
      "loss": 1.2884,
      "step": 550
    },
    {
      "epoch": 0.583596214511041,
      "grad_norm": 0.6964235915302415,
      "learning_rate": 8.84515363030414e-06,
      "loss": 1.2674,
      "step": 555
    },
    {
      "epoch": 0.5888538380651945,
      "grad_norm": 0.7295327929832226,
      "learning_rate": 8.662869131343607e-06,
      "loss": 1.2606,
      "step": 560
    },
    {
      "epoch": 0.594111461619348,
      "grad_norm": 0.6671644394075068,
      "learning_rate": 8.481035936425928e-06,
      "loss": 1.2631,
      "step": 565
    },
    {
      "epoch": 0.5993690851735016,
      "grad_norm": 0.6918129382316475,
      "learning_rate": 8.299715417297072e-06,
      "loss": 1.2733,
      "step": 570
    },
    {
      "epoch": 0.6046267087276551,
      "grad_norm": 0.7648575778497364,
      "learning_rate": 8.118968772666338e-06,
      "loss": 1.2768,
      "step": 575
    },
    {
      "epoch": 0.6098843322818086,
      "grad_norm": 0.6660976751002976,
      "learning_rate": 7.938857007550797e-06,
      "loss": 1.2712,
      "step": 580
    },
    {
      "epoch": 0.6151419558359621,
      "grad_norm": 0.8206761231157318,
      "learning_rate": 7.759440912685043e-06,
      "loss": 1.2629,
      "step": 585
    },
    {
      "epoch": 0.6203995793901157,
      "grad_norm": 0.6781893460495839,
      "learning_rate": 7.580781044003324e-06,
      "loss": 1.2928,
      "step": 590
    },
    {
      "epoch": 0.6256572029442692,
      "grad_norm": 0.7043738811990014,
      "learning_rate": 7.402937702200905e-06,
      "loss": 1.2565,
      "step": 595
    },
    {
      "epoch": 0.6309148264984227,
      "grad_norm": 0.6676378108729303,
      "learning_rate": 7.225970912381557e-06,
      "loss": 1.2441,
      "step": 600
    },
    {
      "epoch": 0.6361724500525763,
      "grad_norm": 0.7086319165657909,
      "learning_rate": 7.04994040379809e-06,
      "loss": 1.2526,
      "step": 605
    },
    {
      "epoch": 0.6414300736067298,
      "grad_norm": 0.6584513029407596,
      "learning_rate": 6.874905589692734e-06,
      "loss": 1.2689,
      "step": 610
    },
    {
      "epoch": 0.6466876971608833,
      "grad_norm": 0.6556498134167102,
      "learning_rate": 6.700925547244173e-06,
      "loss": 1.254,
      "step": 615
    },
    {
      "epoch": 0.6519453207150369,
      "grad_norm": 0.7317497625816267,
      "learning_rate": 6.528058997627995e-06,
      "loss": 1.2773,
      "step": 620
    },
    {
      "epoch": 0.6572029442691903,
      "grad_norm": 0.6943495622984553,
      "learning_rate": 6.356364286197341e-06,
      "loss": 1.2774,
      "step": 625
    },
    {
      "epoch": 0.6624605678233438,
      "grad_norm": 0.6557703327216338,
      "learning_rate": 6.18589936279034e-06,
      "loss": 1.2561,
      "step": 630
    },
    {
      "epoch": 0.6677181913774973,
      "grad_norm": 0.7479106302240468,
      "learning_rate": 6.016721762171098e-06,
      "loss": 1.2592,
      "step": 635
    },
    {
      "epoch": 0.6729758149316509,
      "grad_norm": 0.6501670860792287,
      "learning_rate": 5.848888584610727e-06,
      "loss": 1.2647,
      "step": 640
    },
    {
      "epoch": 0.6782334384858044,
      "grad_norm": 0.6750656043866252,
      "learning_rate": 5.6824564766150724e-06,
      "loss": 1.2687,
      "step": 645
    },
    {
      "epoch": 0.6834910620399579,
      "grad_norm": 0.707162362118668,
      "learning_rate": 5.51748161180554e-06,
      "loss": 1.2658,
      "step": 650
    },
    {
      "epoch": 0.6887486855941115,
      "grad_norm": 0.638182422688773,
      "learning_rate": 5.354019671959601e-06,
      "loss": 1.2618,
      "step": 655
    },
    {
      "epoch": 0.694006309148265,
      "grad_norm": 0.6799258857804185,
      "learning_rate": 5.192125828217203e-06,
      "loss": 1.265,
      "step": 660
    },
    {
      "epoch": 0.6992639327024185,
      "grad_norm": 0.6614260729055945,
      "learning_rate": 5.0318547224596525e-06,
      "loss": 1.2726,
      "step": 665
    },
    {
      "epoch": 0.704521556256572,
      "grad_norm": 0.6571261971890797,
      "learning_rate": 4.873260448867004e-06,
      "loss": 1.2635,
      "step": 670
    },
    {
      "epoch": 0.7097791798107256,
      "grad_norm": 0.6434154288185444,
      "learning_rate": 4.716396535660412e-06,
      "loss": 1.2571,
      "step": 675
    },
    {
      "epoch": 0.7150368033648791,
      "grad_norm": 0.6507843588939136,
      "learning_rate": 4.5613159270354455e-06,
      "loss": 1.2768,
      "step": 680
    },
    {
      "epoch": 0.7202944269190326,
      "grad_norm": 0.6888038425645421,
      "learning_rate": 4.408070965292534e-06,
      "loss": 1.2729,
      "step": 685
    },
    {
      "epoch": 0.7255520504731862,
      "grad_norm": 0.6548677910438372,
      "learning_rate": 4.256713373170565e-06,
      "loss": 1.2754,
      "step": 690
    },
    {
      "epoch": 0.7308096740273397,
      "grad_norm": 0.6924489060189015,
      "learning_rate": 4.107294236389603e-06,
      "loss": 1.2428,
      "step": 695
    },
    {
      "epoch": 0.7360672975814931,
      "grad_norm": 0.675766971656018,
      "learning_rate": 3.959863986408593e-06,
      "loss": 1.2422,
      "step": 700
    },
    {
      "epoch": 0.7413249211356467,
      "grad_norm": 0.6759780214552317,
      "learning_rate": 3.8144723834039076e-06,
      "loss": 1.2596,
      "step": 705
    },
    {
      "epoch": 0.7465825446898002,
      "grad_norm": 0.6577848370025432,
      "learning_rate": 3.671168499474449e-06,
      "loss": 1.2653,
      "step": 710
    },
    {
      "epoch": 0.7518401682439537,
      "grad_norm": 0.6816662338694881,
      "learning_rate": 3.5300007020789997e-06,
      "loss": 1.2612,
      "step": 715
    },
    {
      "epoch": 0.7570977917981072,
      "grad_norm": 0.6412644189429695,
      "learning_rate": 3.3910166377113894e-06,
      "loss": 1.2606,
      "step": 720
    },
    {
      "epoch": 0.7623554153522608,
      "grad_norm": 0.6489342949048617,
      "learning_rate": 3.2542632158190135e-06,
      "loss": 1.2499,
      "step": 725
    },
    {
      "epoch": 0.7676130389064143,
      "grad_norm": 0.6305858451193405,
      "learning_rate": 3.119786592970102e-06,
      "loss": 1.2368,
      "step": 730
    },
    {
      "epoch": 0.7728706624605678,
      "grad_norm": 0.6601632070842128,
      "learning_rate": 2.9876321572751143e-06,
      "loss": 1.2826,
      "step": 735
    },
    {
      "epoch": 0.7781282860147214,
      "grad_norm": 0.6512679140886882,
      "learning_rate": 2.8578445130674835e-06,
      "loss": 1.2509,
      "step": 740
    },
    {
      "epoch": 0.7833859095688749,
      "grad_norm": 0.662446849135675,
      "learning_rate": 2.7304674658489104e-06,
      "loss": 1.2593,
      "step": 745
    },
    {
      "epoch": 0.7886435331230284,
      "grad_norm": 0.6503725043711072,
      "learning_rate": 2.6055440075042793e-06,
      "loss": 1.2696,
      "step": 750
    },
    {
      "epoch": 0.7939011566771819,
      "grad_norm": 0.6349030168998883,
      "learning_rate": 2.4831163017911687e-06,
      "loss": 1.2458,
      "step": 755
    },
    {
      "epoch": 0.7991587802313355,
      "grad_norm": 0.6296499888770365,
      "learning_rate": 2.3632256701088817e-06,
      "loss": 1.2581,
      "step": 760
    },
    {
      "epoch": 0.804416403785489,
      "grad_norm": 0.6477912073667947,
      "learning_rate": 2.2459125775517854e-06,
      "loss": 1.2614,
      "step": 765
    },
    {
      "epoch": 0.8096740273396424,
      "grad_norm": 0.6340147585135972,
      "learning_rate": 2.1312166192516593e-06,
      "loss": 1.2707,
      "step": 770
    },
    {
      "epoch": 0.814931650893796,
      "grad_norm": 0.6535117628009195,
      "learning_rate": 2.019176507013677e-06,
      "loss": 1.2586,
      "step": 775
    },
    {
      "epoch": 0.8201892744479495,
      "grad_norm": 0.6467399510691909,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 1.2548,
      "step": 780
    },
    {
      "epoch": 0.825446898002103,
      "grad_norm": 0.6237075846859721,
      "learning_rate": 1.8032141732190722e-06,
      "loss": 1.26,
      "step": 785
    },
    {
      "epoch": 0.8307045215562566,
      "grad_norm": 0.6374095389974708,
      "learning_rate": 1.6993648425638797e-06,
      "loss": 1.2605,
      "step": 790
    },
    {
      "epoch": 0.8359621451104101,
      "grad_norm": 0.6326077447290254,
      "learning_rate": 1.5983171151717924e-06,
      "loss": 1.2701,
      "step": 795
    },
    {
      "epoch": 0.8412197686645636,
      "grad_norm": 0.6299244980807773,
      "learning_rate": 1.5001050963416718e-06,
      "loss": 1.2427,
      "step": 800
    },
    {
      "epoch": 0.8464773922187171,
      "grad_norm": 0.6281934013360307,
      "learning_rate": 1.404761934273291e-06,
      "loss": 1.2635,
      "step": 805
    },
    {
      "epoch": 0.8517350157728707,
      "grad_norm": 0.6374294536071755,
      "learning_rate": 1.3123198088792577e-06,
      "loss": 1.2717,
      "step": 810
    },
    {
      "epoch": 0.8569926393270242,
      "grad_norm": 0.6298699771941692,
      "learning_rate": 1.222809920923761e-06,
      "loss": 1.2499,
      "step": 815
    },
    {
      "epoch": 0.8622502628811777,
      "grad_norm": 0.6359419744147026,
      "learning_rate": 1.1362624814917843e-06,
      "loss": 1.2552,
      "step": 820
    },
    {
      "epoch": 0.8675078864353313,
      "grad_norm": 0.6285813274381151,
      "learning_rate": 1.0527067017923654e-06,
      "loss": 1.2737,
      "step": 825
    },
    {
      "epoch": 0.8727655099894848,
      "grad_norm": 0.6210713562517961,
      "learning_rate": 9.721707832993232e-07,
      "loss": 1.2663,
      "step": 830
    },
    {
      "epoch": 0.8780231335436383,
      "grad_norm": 0.6645416418213763,
      "learning_rate": 8.946819082327829e-07,
      "loss": 1.2581,
      "step": 835
    },
    {
      "epoch": 0.8832807570977917,
      "grad_norm": 0.6251751866944497,
      "learning_rate": 8.202662303847298e-07,
      "loss": 1.2519,
      "step": 840
    },
    {
      "epoch": 0.8885383806519453,
      "grad_norm": 0.6431790207111582,
      "learning_rate": 7.48948866291661e-07,
      "loss": 1.2614,
      "step": 845
    },
    {
      "epoch": 0.8937960042060988,
      "grad_norm": 0.6384407054635708,
      "learning_rate": 6.80753886757336e-07,
      "loss": 1.2722,
      "step": 850
    },
    {
      "epoch": 0.8990536277602523,
      "grad_norm": 0.631430226459749,
      "learning_rate": 6.157043087284797e-07,
      "loss": 1.2587,
      "step": 855
    },
    {
      "epoch": 0.9043112513144059,
      "grad_norm": 0.631843611779411,
      "learning_rate": 5.538220875261736e-07,
      "loss": 1.2481,
      "step": 860
    },
    {
      "epoch": 0.9095688748685594,
      "grad_norm": 0.6250494334708813,
      "learning_rate": 4.951281094355708e-07,
      "loss": 1.2552,
      "step": 865
    },
    {
      "epoch": 0.9148264984227129,
      "grad_norm": 0.6300532606909163,
      "learning_rate": 4.396421846564236e-07,
      "loss": 1.2536,
      "step": 870
    },
    {
      "epoch": 0.9200841219768665,
      "grad_norm": 0.6200012582073006,
      "learning_rate": 3.8738304061681107e-07,
      "loss": 1.2694,
      "step": 875
    },
    {
      "epoch": 0.92534174553102,
      "grad_norm": 0.6324403821172889,
      "learning_rate": 3.3836831565231877e-07,
      "loss": 1.2545,
      "step": 880
    },
    {
      "epoch": 0.9305993690851735,
      "grad_norm": 0.6325686488791483,
      "learning_rate": 2.926145530528002e-07,
      "loss": 1.2531,
      "step": 885
    },
    {
      "epoch": 0.935856992639327,
      "grad_norm": 0.6373731914514896,
      "learning_rate": 2.501371954787479e-07,
      "loss": 1.2648,
      "step": 890
    },
    {
      "epoch": 0.9411146161934806,
      "grad_norm": 0.6255593953506046,
      "learning_rate": 2.109505797491318e-07,
      "loss": 1.2392,
      "step": 895
    },
    {
      "epoch": 0.9463722397476341,
      "grad_norm": 0.6353907042408413,
      "learning_rate": 1.7506793200248507e-07,
      "loss": 1.2633,
      "step": 900
    },
    {
      "epoch": 0.9516298633017876,
      "grad_norm": 0.6307207707434458,
      "learning_rate": 1.4250136323285868e-07,
      "loss": 1.2603,
      "step": 905
    },
    {
      "epoch": 0.9568874868559412,
      "grad_norm": 0.6277705175750871,
      "learning_rate": 1.1326186520215888e-07,
      "loss": 1.2453,
      "step": 910
    },
    {
      "epoch": 0.9621451104100947,
      "grad_norm": 0.6031775390358434,
      "learning_rate": 8.735930673024806e-08,
      "loss": 1.2544,
      "step": 915
    },
    {
      "epoch": 0.9674027339642481,
      "grad_norm": 0.6189714576376613,
      "learning_rate": 6.480243036404598e-08,
      "loss": 1.2661,
      "step": 920
    },
    {
      "epoch": 0.9726603575184016,
      "grad_norm": 0.6219922382476483,
      "learning_rate": 4.5598849426777833e-08,
      "loss": 1.2418,
      "step": 925
    },
    {
      "epoch": 0.9779179810725552,
      "grad_norm": 0.6188912441406357,
      "learning_rate": 2.9755045448351948e-08,
      "loss": 1.2471,
      "step": 930
    },
    {
      "epoch": 0.9831756046267087,
      "grad_norm": 0.6241442557644146,
      "learning_rate": 1.7276365977730858e-08,
      "loss": 1.2591,
      "step": 935
    },
    {
      "epoch": 0.9884332281808622,
      "grad_norm": 0.6148942917091785,
      "learning_rate": 8.167022778045042e-09,
      "loss": 1.2783,
      "step": 940
    },
    {
      "epoch": 0.9936908517350158,
      "grad_norm": 0.6142845224019251,
      "learning_rate": 2.430090405054486e-09,
      "loss": 1.2311,
      "step": 945
    },
    {
      "epoch": 0.9989484752891693,
      "grad_norm": 0.6318778333420784,
      "learning_rate": 6.750516943321295e-11,
      "loss": 1.2488,
      "step": 950
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.266036868095398,
      "eval_runtime": 32.6481,
      "eval_samples_per_second": 412.213,
      "eval_steps_per_second": 6.463,
      "step": 951
    },
    {
      "epoch": 1.0,
      "step": 951,
      "total_flos": 101091718987776.0,
      "train_loss": 1.289618753484371,
      "train_runtime": 1098.6048,
      "train_samples_per_second": 110.718,
      "train_steps_per_second": 0.866
    }
  ],
  "logging_steps": 5,
  "max_steps": 951,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 101091718987776.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}