|
{ |
|
"best_metric": 0.794740617275238, |
|
"best_model_checkpoint": "saves/CADICA_qwenvl_stenosis_classily/lora/sft/checkpoint-700", |
|
"epoch": 1.9968586387434555, |
|
"eval_steps": 50, |
|
"global_step": 716, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013961605584642234, |
|
"grad_norm": 21.25276507868793, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 2.9908, |
|
"num_input_tokens_seen": 77944, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.027923211169284468, |
|
"grad_norm": 21.89043285054519, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 3.0071, |
|
"num_input_tokens_seen": 155896, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.041884816753926704, |
|
"grad_norm": 16.65776874449816, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 2.354, |
|
"num_input_tokens_seen": 233896, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.055846422338568937, |
|
"grad_norm": 3.772799389266845, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 1.2959, |
|
"num_input_tokens_seen": 311840, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06980802792321117, |
|
"grad_norm": 2.5936011954385334, |
|
"learning_rate": 3.472222222222222e-05, |
|
"loss": 1.0206, |
|
"num_input_tokens_seen": 389816, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08376963350785341, |
|
"grad_norm": 1.380523901017673, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.9285, |
|
"num_input_tokens_seen": 467808, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09773123909249563, |
|
"grad_norm": 0.9535971270874376, |
|
"learning_rate": 4.8611111111111115e-05, |
|
"loss": 0.9052, |
|
"num_input_tokens_seen": 545776, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.11169284467713787, |
|
"grad_norm": 0.7487685762175865, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 0.929, |
|
"num_input_tokens_seen": 623744, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1256544502617801, |
|
"grad_norm": 0.9517829869317949, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.9076, |
|
"num_input_tokens_seen": 701720, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.13961605584642234, |
|
"grad_norm": 0.5105376471286923, |
|
"learning_rate": 6.944444444444444e-05, |
|
"loss": 0.9039, |
|
"num_input_tokens_seen": 779728, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13961605584642234, |
|
"eval_loss": 0.9039102792739868, |
|
"eval_runtime": 74.9579, |
|
"eval_samples_per_second": 1.948, |
|
"eval_steps_per_second": 0.494, |
|
"num_input_tokens_seen": 779728, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15357766143106458, |
|
"grad_norm": 0.6125311992064874, |
|
"learning_rate": 7.638888888888889e-05, |
|
"loss": 0.8983, |
|
"num_input_tokens_seen": 857728, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.16753926701570682, |
|
"grad_norm": 0.8799068808838695, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.9115, |
|
"num_input_tokens_seen": 935680, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18150087260034903, |
|
"grad_norm": 0.7270711909487898, |
|
"learning_rate": 9.027777777777779e-05, |
|
"loss": 0.9022, |
|
"num_input_tokens_seen": 1013664, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.19546247818499127, |
|
"grad_norm": 0.6023654770278246, |
|
"learning_rate": 9.722222222222223e-05, |
|
"loss": 0.8981, |
|
"num_input_tokens_seen": 1091656, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2094240837696335, |
|
"grad_norm": 0.5698794386547648, |
|
"learning_rate": 9.999464569905628e-05, |
|
"loss": 0.9067, |
|
"num_input_tokens_seen": 1169664, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.22338568935427575, |
|
"grad_norm": 0.32260644881875, |
|
"learning_rate": 9.99619291237835e-05, |
|
"loss": 0.9075, |
|
"num_input_tokens_seen": 1247672, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.23734729493891799, |
|
"grad_norm": 0.41708039368778405, |
|
"learning_rate": 9.989949002448076e-05, |
|
"loss": 0.8964, |
|
"num_input_tokens_seen": 1325640, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2513089005235602, |
|
"grad_norm": 0.6145758907120942, |
|
"learning_rate": 9.980736554638366e-05, |
|
"loss": 0.9128, |
|
"num_input_tokens_seen": 1403688, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.26527050610820246, |
|
"grad_norm": 0.30302247663937915, |
|
"learning_rate": 9.968561049466214e-05, |
|
"loss": 0.8991, |
|
"num_input_tokens_seen": 1481664, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2792321116928447, |
|
"grad_norm": 0.32920212256475023, |
|
"learning_rate": 9.953429730181653e-05, |
|
"loss": 0.9033, |
|
"num_input_tokens_seen": 1559632, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2792321116928447, |
|
"eval_loss": 0.9009457230567932, |
|
"eval_runtime": 47.3577, |
|
"eval_samples_per_second": 3.083, |
|
"eval_steps_per_second": 0.781, |
|
"num_input_tokens_seen": 1559632, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2931937172774869, |
|
"grad_norm": 0.3748068082841054, |
|
"learning_rate": 9.935351598458742e-05, |
|
"loss": 0.902, |
|
"num_input_tokens_seen": 1637592, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.30715532286212915, |
|
"grad_norm": 0.367692204424778, |
|
"learning_rate": 9.914337409040418e-05, |
|
"loss": 0.903, |
|
"num_input_tokens_seen": 1715592, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.32111692844677137, |
|
"grad_norm": 0.523389228578757, |
|
"learning_rate": 9.890399663340478e-05, |
|
"loss": 0.9014, |
|
"num_input_tokens_seen": 1793544, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.33507853403141363, |
|
"grad_norm": 0.7666885810234405, |
|
"learning_rate": 9.863552602006435e-05, |
|
"loss": 0.8966, |
|
"num_input_tokens_seen": 1871520, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.34904013961605584, |
|
"grad_norm": 0.45411297588089927, |
|
"learning_rate": 9.83381219644771e-05, |
|
"loss": 0.9032, |
|
"num_input_tokens_seen": 1949488, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.36300174520069806, |
|
"grad_norm": 0.34304009173464395, |
|
"learning_rate": 9.801196139334195e-05, |
|
"loss": 0.8919, |
|
"num_input_tokens_seen": 2027488, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3769633507853403, |
|
"grad_norm": 0.46756876437741973, |
|
"learning_rate": 9.765723834070804e-05, |
|
"loss": 0.9025, |
|
"num_input_tokens_seen": 2105424, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.39092495636998253, |
|
"grad_norm": 0.5067842714503226, |
|
"learning_rate": 9.72741638325434e-05, |
|
"loss": 0.9001, |
|
"num_input_tokens_seen": 2183432, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4048865619546248, |
|
"grad_norm": 0.45330848125911494, |
|
"learning_rate": 9.686296576119471e-05, |
|
"loss": 0.9007, |
|
"num_input_tokens_seen": 2261408, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.418848167539267, |
|
"grad_norm": 0.3379495380306588, |
|
"learning_rate": 9.642388874981347e-05, |
|
"loss": 0.9001, |
|
"num_input_tokens_seen": 2339368, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.418848167539267, |
|
"eval_loss": 0.8987511396408081, |
|
"eval_runtime": 46.4204, |
|
"eval_samples_per_second": 3.145, |
|
"eval_steps_per_second": 0.797, |
|
"num_input_tokens_seen": 2339368, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4328097731239092, |
|
"grad_norm": 0.4705971769713144, |
|
"learning_rate": 9.595719400682881e-05, |
|
"loss": 0.8974, |
|
"num_input_tokens_seen": 2417328, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.4467713787085515, |
|
"grad_norm": 0.2877245912486978, |
|
"learning_rate": 9.546315917055361e-05, |
|
"loss": 0.895, |
|
"num_input_tokens_seen": 2495328, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4607329842931937, |
|
"grad_norm": 0.3089085786477158, |
|
"learning_rate": 9.494207814401672e-05, |
|
"loss": 0.8993, |
|
"num_input_tokens_seen": 2573264, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.47469458987783597, |
|
"grad_norm": 0.23434151328428765, |
|
"learning_rate": 9.439426092011875e-05, |
|
"loss": 0.9011, |
|
"num_input_tokens_seen": 2651200, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4886561954624782, |
|
"grad_norm": 0.3895079869752368, |
|
"learning_rate": 9.382003339721652e-05, |
|
"loss": 0.8943, |
|
"num_input_tokens_seen": 2729208, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5026178010471204, |
|
"grad_norm": 0.21859941380068879, |
|
"learning_rate": 9.321973718524472e-05, |
|
"loss": 0.9074, |
|
"num_input_tokens_seen": 2807176, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5165794066317626, |
|
"grad_norm": 0.3026569089827376, |
|
"learning_rate": 9.25937294024912e-05, |
|
"loss": 0.8979, |
|
"num_input_tokens_seen": 2885136, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5305410122164049, |
|
"grad_norm": 0.24552828812005026, |
|
"learning_rate": 9.194238246314599e-05, |
|
"loss": 0.8908, |
|
"num_input_tokens_seen": 2963120, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5445026178010471, |
|
"grad_norm": 0.31370225105827704, |
|
"learning_rate": 9.126608385575076e-05, |
|
"loss": 0.8922, |
|
"num_input_tokens_seen": 3041096, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5584642233856894, |
|
"grad_norm": 0.19991243424614802, |
|
"learning_rate": 9.056523591268064e-05, |
|
"loss": 0.902, |
|
"num_input_tokens_seen": 3119064, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5584642233856894, |
|
"eval_loss": 0.9003660678863525, |
|
"eval_runtime": 46.4656, |
|
"eval_samples_per_second": 3.142, |
|
"eval_steps_per_second": 0.796, |
|
"num_input_tokens_seen": 3119064, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5724258289703316, |
|
"grad_norm": 0.29215232660029566, |
|
"learning_rate": 8.984025557079523e-05, |
|
"loss": 0.9016, |
|
"num_input_tokens_seen": 3197048, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5863874345549738, |
|
"grad_norm": 0.4850357234182881, |
|
"learning_rate": 8.90915741234015e-05, |
|
"loss": 0.907, |
|
"num_input_tokens_seen": 3275024, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6003490401396161, |
|
"grad_norm": 0.3567000129034729, |
|
"learning_rate": 8.831963696367581e-05, |
|
"loss": 0.8966, |
|
"num_input_tokens_seen": 3353024, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.6143106457242583, |
|
"grad_norm": 0.26853087654006846, |
|
"learning_rate": 8.752490331969807e-05, |
|
"loss": 0.9031, |
|
"num_input_tokens_seen": 3430936, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6282722513089005, |
|
"grad_norm": 0.20742364801678845, |
|
"learning_rate": 8.670784598125533e-05, |
|
"loss": 0.9012, |
|
"num_input_tokens_seen": 3508920, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6422338568935427, |
|
"grad_norm": 0.2838565453202246, |
|
"learning_rate": 8.586895101857747e-05, |
|
"loss": 0.8936, |
|
"num_input_tokens_seen": 3586920, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6561954624781849, |
|
"grad_norm": 0.4892803498442385, |
|
"learning_rate": 8.500871749317243e-05, |
|
"loss": 0.9042, |
|
"num_input_tokens_seen": 3664896, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6701570680628273, |
|
"grad_norm": 0.37976656488421395, |
|
"learning_rate": 8.412765716093272e-05, |
|
"loss": 0.9034, |
|
"num_input_tokens_seen": 3742832, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6841186736474695, |
|
"grad_norm": 0.291645150101734, |
|
"learning_rate": 8.322629416769006e-05, |
|
"loss": 0.8969, |
|
"num_input_tokens_seen": 3820792, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6980802792321117, |
|
"grad_norm": 0.30837187813275513, |
|
"learning_rate": 8.230516473739935e-05, |
|
"loss": 0.8933, |
|
"num_input_tokens_seen": 3898784, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6980802792321117, |
|
"eval_loss": 0.9052047729492188, |
|
"eval_runtime": 46.4894, |
|
"eval_samples_per_second": 3.141, |
|
"eval_steps_per_second": 0.796, |
|
"num_input_tokens_seen": 3898784, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7120418848167539, |
|
"grad_norm": 0.30727171157251326, |
|
"learning_rate": 8.1364816853137e-05, |
|
"loss": 0.9079, |
|
"num_input_tokens_seen": 3976824, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7260034904013961, |
|
"grad_norm": 0.24728667308223853, |
|
"learning_rate": 8.040580993110404e-05, |
|
"loss": 0.9044, |
|
"num_input_tokens_seen": 4054752, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7399650959860384, |
|
"grad_norm": 0.154782478333375, |
|
"learning_rate": 7.942871448782748e-05, |
|
"loss": 0.895, |
|
"num_input_tokens_seen": 4132664, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7539267015706806, |
|
"grad_norm": 0.21937062747939326, |
|
"learning_rate": 7.843411180075794e-05, |
|
"loss": 0.8984, |
|
"num_input_tokens_seen": 4210656, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7678883071553229, |
|
"grad_norm": 0.28854882430140594, |
|
"learning_rate": 7.742259356246593e-05, |
|
"loss": 0.904, |
|
"num_input_tokens_seen": 4288664, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7818499127399651, |
|
"grad_norm": 0.23726372374470991, |
|
"learning_rate": 7.639476152864162e-05, |
|
"loss": 0.8973, |
|
"num_input_tokens_seen": 4366608, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7958115183246073, |
|
"grad_norm": 0.34106252388262337, |
|
"learning_rate": 7.535122716010849e-05, |
|
"loss": 0.9018, |
|
"num_input_tokens_seen": 4444568, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.8097731239092496, |
|
"grad_norm": 0.37561723629929783, |
|
"learning_rate": 7.42926112590631e-05, |
|
"loss": 0.8886, |
|
"num_input_tokens_seen": 4522512, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8237347294938918, |
|
"grad_norm": 0.26053439132207656, |
|
"learning_rate": 7.321954359975776e-05, |
|
"loss": 0.9002, |
|
"num_input_tokens_seen": 4600504, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"grad_norm": 0.248855595995717, |
|
"learning_rate": 7.21326625538456e-05, |
|
"loss": 0.897, |
|
"num_input_tokens_seen": 4678472, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"eval_loss": 0.9003945589065552, |
|
"eval_runtime": 46.3308, |
|
"eval_samples_per_second": 3.151, |
|
"eval_steps_per_second": 0.799, |
|
"num_input_tokens_seen": 4678472, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8516579406631762, |
|
"grad_norm": 0.18200381843269203, |
|
"learning_rate": 7.103261471061116e-05, |
|
"loss": 0.9088, |
|
"num_input_tokens_seen": 4756440, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8656195462478184, |
|
"grad_norm": 0.19255528640902111, |
|
"learning_rate": 6.992005449231208e-05, |
|
"loss": 0.899, |
|
"num_input_tokens_seen": 4834424, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8795811518324608, |
|
"grad_norm": 0.3960850388870267, |
|
"learning_rate": 6.879564376486114e-05, |
|
"loss": 0.905, |
|
"num_input_tokens_seen": 4912376, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.893542757417103, |
|
"grad_norm": 0.3472454916197344, |
|
"learning_rate": 6.76600514440799e-05, |
|
"loss": 0.8968, |
|
"num_input_tokens_seen": 4990328, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9075043630017452, |
|
"grad_norm": 0.42131468150264795, |
|
"learning_rate": 6.651395309775837e-05, |
|
"loss": 0.8916, |
|
"num_input_tokens_seen": 5068304, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.9214659685863874, |
|
"grad_norm": 0.7373865772840422, |
|
"learning_rate": 6.535803054375738e-05, |
|
"loss": 0.8937, |
|
"num_input_tokens_seen": 5146272, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9354275741710296, |
|
"grad_norm": 0.6075085371236206, |
|
"learning_rate": 6.419297144439283e-05, |
|
"loss": 0.8965, |
|
"num_input_tokens_seen": 5224232, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9493891797556719, |
|
"grad_norm": 0.5336896324950464, |
|
"learning_rate": 6.301946889734302e-05, |
|
"loss": 0.8957, |
|
"num_input_tokens_seen": 5302200, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9633507853403142, |
|
"grad_norm": 0.7535717304812762, |
|
"learning_rate": 6.183822102332234e-05, |
|
"loss": 0.9025, |
|
"num_input_tokens_seen": 5380168, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9773123909249564, |
|
"grad_norm": 1.9146693047973427, |
|
"learning_rate": 6.064993055076698e-05, |
|
"loss": 0.8997, |
|
"num_input_tokens_seen": 5458104, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9773123909249564, |
|
"eval_loss": 0.9016226530075073, |
|
"eval_runtime": 46.29, |
|
"eval_samples_per_second": 3.154, |
|
"eval_steps_per_second": 0.799, |
|
"num_input_tokens_seen": 5458104, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9912739965095986, |
|
"grad_norm": 0.4275320872134287, |
|
"learning_rate": 5.945530439777923e-05, |
|
"loss": 0.902, |
|
"num_input_tokens_seen": 5536072, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.0027923211169285, |
|
"grad_norm": 1.5609644797837416, |
|
"learning_rate": 5.8255053251579616e-05, |
|
"loss": 0.7347, |
|
"num_input_tokens_seen": 5600392, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0167539267015706, |
|
"grad_norm": 0.4880192687307677, |
|
"learning_rate": 5.704989114571648e-05, |
|
"loss": 0.8899, |
|
"num_input_tokens_seen": 5678424, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.030715532286213, |
|
"grad_norm": 0.37744049750587216, |
|
"learning_rate": 5.5840535035285025e-05, |
|
"loss": 0.8929, |
|
"num_input_tokens_seen": 5756400, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0446771378708553, |
|
"grad_norm": 0.9395658719072697, |
|
"learning_rate": 5.4627704370408236e-05, |
|
"loss": 0.8904, |
|
"num_input_tokens_seen": 5834352, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.0586387434554974, |
|
"grad_norm": 0.5490353927737941, |
|
"learning_rate": 5.341212066823355e-05, |
|
"loss": 0.8964, |
|
"num_input_tokens_seen": 5912320, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0726003490401397, |
|
"grad_norm": 1.1114978460946199, |
|
"learning_rate": 5.219450708369977e-05, |
|
"loss": 0.8843, |
|
"num_input_tokens_seen": 5990312, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.0865619546247818, |
|
"grad_norm": 1.1913979980069938, |
|
"learning_rate": 5.0975587979329734e-05, |
|
"loss": 0.8879, |
|
"num_input_tokens_seen": 6068280, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.100523560209424, |
|
"grad_norm": 2.222951083897475, |
|
"learning_rate": 4.9756088494304504e-05, |
|
"loss": 0.8816, |
|
"num_input_tokens_seen": 6146288, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.1144851657940662, |
|
"grad_norm": 1.5657191660795382, |
|
"learning_rate": 4.853673411307564e-05, |
|
"loss": 0.9109, |
|
"num_input_tokens_seen": 6224248, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1144851657940662, |
|
"eval_loss": 0.8960007429122925, |
|
"eval_runtime": 46.5432, |
|
"eval_samples_per_second": 3.137, |
|
"eval_steps_per_second": 0.795, |
|
"num_input_tokens_seen": 6224248, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1284467713787085, |
|
"grad_norm": 2.66254478509122, |
|
"learning_rate": 4.731825023377192e-05, |
|
"loss": 0.8631, |
|
"num_input_tokens_seen": 6302208, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.1424083769633508, |
|
"grad_norm": 1.7871963773512198, |
|
"learning_rate": 4.610136173665751e-05, |
|
"loss": 0.8722, |
|
"num_input_tokens_seen": 6380096, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.156369982547993, |
|
"grad_norm": 2.511479150461674, |
|
"learning_rate": 4.4886792552898286e-05, |
|
"loss": 0.864, |
|
"num_input_tokens_seen": 6458096, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.1703315881326353, |
|
"grad_norm": 2.9185608745372837, |
|
"learning_rate": 4.367526523389253e-05, |
|
"loss": 0.8446, |
|
"num_input_tokens_seen": 6536064, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1842931937172776, |
|
"grad_norm": 3.7053458609530403, |
|
"learning_rate": 4.24675005214227e-05, |
|
"loss": 0.8576, |
|
"num_input_tokens_seen": 6614048, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.1982547993019197, |
|
"grad_norm": 2.3073667949552124, |
|
"learning_rate": 4.1264216918883656e-05, |
|
"loss": 0.8715, |
|
"num_input_tokens_seen": 6691984, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.212216404886562, |
|
"grad_norm": 2.597389084730036, |
|
"learning_rate": 4.006613026384249e-05, |
|
"loss": 0.8708, |
|
"num_input_tokens_seen": 6769984, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.2261780104712041, |
|
"grad_norm": 1.889225145403208, |
|
"learning_rate": 3.887395330218429e-05, |
|
"loss": 0.8546, |
|
"num_input_tokens_seen": 6847976, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.2401396160558464, |
|
"grad_norm": 2.61428130233799, |
|
"learning_rate": 3.768839526409718e-05, |
|
"loss": 0.8592, |
|
"num_input_tokens_seen": 6925944, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.2541012216404885, |
|
"grad_norm": 5.825082846471074, |
|
"learning_rate": 3.651016144214878e-05, |
|
"loss": 0.8127, |
|
"num_input_tokens_seen": 7003904, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2541012216404885, |
|
"eval_loss": 0.8821887373924255, |
|
"eval_runtime": 46.527, |
|
"eval_samples_per_second": 3.138, |
|
"eval_steps_per_second": 0.795, |
|
"num_input_tokens_seen": 7003904, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2680628272251309, |
|
"grad_norm": 3.745385127572465, |
|
"learning_rate": 3.533995277170532e-05, |
|
"loss": 0.837, |
|
"num_input_tokens_seen": 7081856, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.2820244328097732, |
|
"grad_norm": 3.1595331481084896, |
|
"learning_rate": 3.4178465413942625e-05, |
|
"loss": 0.8631, |
|
"num_input_tokens_seen": 7159776, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.2959860383944153, |
|
"grad_norm": 3.241335528071935, |
|
"learning_rate": 3.3026390341697576e-05, |
|
"loss": 0.8511, |
|
"num_input_tokens_seen": 7237720, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.3099476439790576, |
|
"grad_norm": 2.065109380918334, |
|
"learning_rate": 3.188441292840587e-05, |
|
"loss": 0.8439, |
|
"num_input_tokens_seen": 7315704, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.3239092495637, |
|
"grad_norm": 3.25037907997737, |
|
"learning_rate": 3.075321254037112e-05, |
|
"loss": 0.872, |
|
"num_input_tokens_seen": 7393672, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.337870855148342, |
|
"grad_norm": 4.772644937407586, |
|
"learning_rate": 2.963346213260737e-05, |
|
"loss": 0.8397, |
|
"num_input_tokens_seen": 7471632, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.3518324607329844, |
|
"grad_norm": 4.371699254241009, |
|
"learning_rate": 2.8525827848495913e-05, |
|
"loss": 0.8254, |
|
"num_input_tokens_seen": 7549624, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.3657940663176265, |
|
"grad_norm": 3.1620853722419797, |
|
"learning_rate": 2.743096862349427e-05, |
|
"loss": 0.8236, |
|
"num_input_tokens_seen": 7627568, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3797556719022688, |
|
"grad_norm": 5.330846549870451, |
|
"learning_rate": 2.6349535793133196e-05, |
|
"loss": 0.8561, |
|
"num_input_tokens_seen": 7705512, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.3937172774869109, |
|
"grad_norm": 6.723230790293138, |
|
"learning_rate": 2.5282172705535013e-05, |
|
"loss": 0.8198, |
|
"num_input_tokens_seen": 7783528, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3937172774869109, |
|
"eval_loss": 0.846021294593811, |
|
"eval_runtime": 46.2996, |
|
"eval_samples_per_second": 3.153, |
|
"eval_steps_per_second": 0.799, |
|
"num_input_tokens_seen": 7783528, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4076788830715532, |
|
"grad_norm": 3.08479773422221, |
|
"learning_rate": 2.4229514338683458e-05, |
|
"loss": 0.8498, |
|
"num_input_tokens_seen": 7861512, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.4216404886561955, |
|
"grad_norm": 6.6639723675246385, |
|
"learning_rate": 2.3192186922673186e-05, |
|
"loss": 0.8195, |
|
"num_input_tokens_seen": 7939480, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.4356020942408376, |
|
"grad_norm": 5.041383098958532, |
|
"learning_rate": 2.2170807567163294e-05, |
|
"loss": 0.8428, |
|
"num_input_tokens_seen": 8017496, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.44956369982548, |
|
"grad_norm": 6.547003709523484, |
|
"learning_rate": 2.1165983894256647e-05, |
|
"loss": 0.8534, |
|
"num_input_tokens_seen": 8095504, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.4635253054101223, |
|
"grad_norm": 6.396371201229991, |
|
"learning_rate": 2.0178313677023425e-05, |
|
"loss": 0.8113, |
|
"num_input_tokens_seen": 8173440, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.4774869109947644, |
|
"grad_norm": 3.194803428296668, |
|
"learning_rate": 1.9208384483883817e-05, |
|
"loss": 0.8325, |
|
"num_input_tokens_seen": 8251400, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.4914485165794067, |
|
"grad_norm": 3.8201513107026552, |
|
"learning_rate": 1.8256773329061567e-05, |
|
"loss": 0.8158, |
|
"num_input_tokens_seen": 8329384, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.505410122164049, |
|
"grad_norm": 7.081372136918514, |
|
"learning_rate": 1.732404632931625e-05, |
|
"loss": 0.8183, |
|
"num_input_tokens_seen": 8407384, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.5193717277486911, |
|
"grad_norm": 5.594321718168384, |
|
"learning_rate": 1.6410758367158385e-05, |
|
"loss": 0.8364, |
|
"num_input_tokens_seen": 8485328, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.5333333333333332, |
|
"grad_norm": 6.74399246622262, |
|
"learning_rate": 1.5517452760747975e-05, |
|
"loss": 0.832, |
|
"num_input_tokens_seen": 8563264, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.5333333333333332, |
|
"eval_loss": 0.8187811374664307, |
|
"eval_runtime": 46.4274, |
|
"eval_samples_per_second": 3.145, |
|
"eval_steps_per_second": 0.797, |
|
"num_input_tokens_seen": 8563264, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.5472949389179755, |
|
"grad_norm": 3.6364413823808968, |
|
"learning_rate": 1.4644660940672627e-05, |
|
"loss": 0.8445, |
|
"num_input_tokens_seen": 8641240, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.5612565445026179, |
|
"grad_norm": 4.8432505694035495, |
|
"learning_rate": 1.3792902133797692e-05, |
|
"loss": 0.8372, |
|
"num_input_tokens_seen": 8719256, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.57521815008726, |
|
"grad_norm": 3.5469277881436545, |
|
"learning_rate": 1.2962683054376373e-05, |
|
"loss": 0.8107, |
|
"num_input_tokens_seen": 8797240, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.5891797556719023, |
|
"grad_norm": 3.916756694748693, |
|
"learning_rate": 1.2154497602603703e-05, |
|
"loss": 0.8472, |
|
"num_input_tokens_seen": 8875208, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.6031413612565446, |
|
"grad_norm": 5.159527110013197, |
|
"learning_rate": 1.13688265707936e-05, |
|
"loss": 0.8221, |
|
"num_input_tokens_seen": 8953176, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.6171029668411867, |
|
"grad_norm": 3.9357730966522637, |
|
"learning_rate": 1.060613735735384e-05, |
|
"loss": 0.8076, |
|
"num_input_tokens_seen": 9031192, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.6310645724258288, |
|
"grad_norm": 4.296138435306084, |
|
"learning_rate": 9.86688368872919e-06, |
|
"loss": 0.7987, |
|
"num_input_tokens_seen": 9109184, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.6450261780104714, |
|
"grad_norm": 5.917164796099687, |
|
"learning_rate": 9.151505349477902e-06, |
|
"loss": 0.7814, |
|
"num_input_tokens_seen": 9187136, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.6589877835951135, |
|
"grad_norm": 6.642916615289702, |
|
"learning_rate": 8.460427920642423e-06, |
|
"loss": 0.7907, |
|
"num_input_tokens_seen": 9265112, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.6729493891797556, |
|
"grad_norm": 6.455529863705343, |
|
"learning_rate": 7.794062526569734e-06, |
|
"loss": 0.786, |
|
"num_input_tokens_seen": 9343120, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6729493891797556, |
|
"eval_loss": 0.8021153211593628, |
|
"eval_runtime": 46.5141, |
|
"eval_samples_per_second": 3.139, |
|
"eval_steps_per_second": 0.795, |
|
"num_input_tokens_seen": 9343120, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6869109947643979, |
|
"grad_norm": 18.00061211274046, |
|
"learning_rate": 7.152805590332079e-06, |
|
"loss": 0.7702, |
|
"num_input_tokens_seen": 9421080, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.7008726003490402, |
|
"grad_norm": 6.515893934614007, |
|
"learning_rate": 6.53703859789348e-06, |
|
"loss": 0.7447, |
|
"num_input_tokens_seen": 9499048, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.7148342059336823, |
|
"grad_norm": 8.393536107737633, |
|
"learning_rate": 5.947127871162456e-06, |
|
"loss": 0.7943, |
|
"num_input_tokens_seen": 9577048, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.7287958115183246, |
|
"grad_norm": 10.522936719634327, |
|
"learning_rate": 5.383424350065824e-06, |
|
"loss": 0.7784, |
|
"num_input_tokens_seen": 9655032, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.742757417102967, |
|
"grad_norm": 4.303878340242376, |
|
"learning_rate": 4.846263383773364e-06, |
|
"loss": 0.8188, |
|
"num_input_tokens_seen": 9733000, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.756719022687609, |
|
"grad_norm": 7.6073494826846035, |
|
"learning_rate": 4.335964531197401e-06, |
|
"loss": 0.7514, |
|
"num_input_tokens_seen": 9810984, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.7706806282722511, |
|
"grad_norm": 4.962378312676078, |
|
"learning_rate": 3.8528313708861174e-06, |
|
"loss": 0.8165, |
|
"num_input_tokens_seen": 9888984, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.7846422338568937, |
|
"grad_norm": 5.475236961963435, |
|
"learning_rate": 3.397151320423647e-06, |
|
"loss": 0.7778, |
|
"num_input_tokens_seen": 9966984, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.7986038394415358, |
|
"grad_norm": 7.658362208235343, |
|
"learning_rate": 2.9691954654443355e-06, |
|
"loss": 0.7745, |
|
"num_input_tokens_seen": 10044928, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.812565445026178, |
|
"grad_norm": 6.541910443575109, |
|
"learning_rate": 2.5692183983629713e-06, |
|
"loss": 0.8312, |
|
"num_input_tokens_seen": 10122936, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.812565445026178, |
|
"eval_loss": 0.7986289858818054, |
|
"eval_runtime": 46.4273, |
|
"eval_samples_per_second": 3.145, |
|
"eval_steps_per_second": 0.797, |
|
"num_input_tokens_seen": 10122936, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.8265270506108202, |
|
"grad_norm": 9.39953604203485, |
|
"learning_rate": 2.197458066916891e-06, |
|
"loss": 0.794, |
|
"num_input_tokens_seen": 10200848, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.8404886561954625, |
|
"grad_norm": 5.45740552150688, |
|
"learning_rate": 1.8541356326100433e-06, |
|
"loss": 0.8129, |
|
"num_input_tokens_seen": 10278848, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.8544502617801046, |
|
"grad_norm": 5.2960185422888575, |
|
"learning_rate": 1.5394553391432143e-06, |
|
"loss": 0.8057, |
|
"num_input_tokens_seen": 10356800, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.868411867364747, |
|
"grad_norm": 4.792059943516234, |
|
"learning_rate": 1.2536043909088191e-06, |
|
"loss": 0.7678, |
|
"num_input_tokens_seen": 10434768, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.8823734729493893, |
|
"grad_norm": 4.924682676533088, |
|
"learning_rate": 9.967528416222838e-07, |
|
"loss": 0.7851, |
|
"num_input_tokens_seen": 10512752, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.8963350785340314, |
|
"grad_norm": 7.6405932797853096, |
|
"learning_rate": 7.690534931565518e-07, |
|
"loss": 0.7381, |
|
"num_input_tokens_seen": 10590760, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.9102966841186735, |
|
"grad_norm": 4.326706788996488, |
|
"learning_rate": 5.706418046396989e-07, |
|
"loss": 0.7495, |
|
"num_input_tokens_seen": 10668752, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.924258289703316, |
|
"grad_norm": 5.0004961580783815, |
|
"learning_rate": 4.0163581186984935e-07, |
|
"loss": 0.7406, |
|
"num_input_tokens_seen": 10746696, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.9382198952879581, |
|
"grad_norm": 7.870737498179825, |
|
"learning_rate": 2.62136057095258e-07, |
|
"loss": 0.7823, |
|
"num_input_tokens_seen": 10824664, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.9521815008726002, |
|
"grad_norm": 6.748561230572036, |
|
"learning_rate": 1.5222552920138856e-07, |
|
"loss": 0.7797, |
|
"num_input_tokens_seen": 10902632, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.9521815008726002, |
|
"eval_loss": 0.794740617275238, |
|
"eval_runtime": 46.5422, |
|
"eval_samples_per_second": 3.137, |
|
"eval_steps_per_second": 0.795, |
|
"num_input_tokens_seen": 10902632, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.9661431064572426, |
|
"grad_norm": 23.84213538429929, |
|
"learning_rate": 7.196961434052796e-08, |
|
"loss": 0.8029, |
|
"num_input_tokens_seen": 10980576, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.9801047120418849, |
|
"grad_norm": 7.4190814811781145, |
|
"learning_rate": 2.1416057033352144e-08, |
|
"loss": 0.7942, |
|
"num_input_tokens_seen": 11058552, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.994066317626527, |
|
"grad_norm": 6.100028794501354, |
|
"learning_rate": 5.949317655462583e-10, |
|
"loss": 0.7964, |
|
"num_input_tokens_seen": 11136520, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.9968586387434555, |
|
"num_input_tokens_seen": 11152104, |
|
"step": 716, |
|
"total_flos": 754095660204032.0, |
|
"train_loss": 0.9049516569136241, |
|
"train_runtime": 17154.0309, |
|
"train_samples_per_second": 1.336, |
|
"train_steps_per_second": 0.042 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 716, |
|
"num_input_tokens_seen": 11152104, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 754095660204032.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|