|
{ |
|
"best_metric": 2.017824411392212, |
|
"best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_pct_default_r16/checkpoint-376", |
|
"epoch": 0.9983071342200726, |
|
"eval_steps": 8, |
|
"global_step": 387, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0025796049979846837, |
|
"grad_norm": 16.779726028442383, |
|
"learning_rate": 1.25e-05, |
|
"loss": 2.1524, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.010318419991938735, |
|
"grad_norm": 10.524264335632324, |
|
"learning_rate": 5e-05, |
|
"loss": 2.0461, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.02063683998387747, |
|
"grad_norm": 10.28955078125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.9646, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02063683998387747, |
|
"eval_loss": 2.0348494052886963, |
|
"eval_runtime": 136.7937, |
|
"eval_samples_per_second": 1.791, |
|
"eval_steps_per_second": 0.899, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.030955259975816204, |
|
"grad_norm": 19.959510803222656, |
|
"learning_rate": 9.997251843068762e-05, |
|
"loss": 2.012, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.04127367996775494, |
|
"grad_norm": 14.419052124023438, |
|
"learning_rate": 9.989010393221656e-05, |
|
"loss": 2.0531, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04127367996775494, |
|
"eval_loss": 2.034454107284546, |
|
"eval_runtime": 132.3292, |
|
"eval_samples_per_second": 1.851, |
|
"eval_steps_per_second": 0.93, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.051592099959693674, |
|
"grad_norm": 17.94999122619629, |
|
"learning_rate": 9.97528470997769e-05, |
|
"loss": 2.1603, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06191051995163241, |
|
"grad_norm": 14.041346549987793, |
|
"learning_rate": 9.956089881469482e-05, |
|
"loss": 2.1168, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06191051995163241, |
|
"eval_loss": 2.046419382095337, |
|
"eval_runtime": 124.8023, |
|
"eval_samples_per_second": 1.963, |
|
"eval_steps_per_second": 0.986, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.07222893994357114, |
|
"grad_norm": 16.775529861450195, |
|
"learning_rate": 9.931447007857432e-05, |
|
"loss": 2.075, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.08254735993550988, |
|
"grad_norm": 16.405385971069336, |
|
"learning_rate": 9.901383178135113e-05, |
|
"loss": 2.0712, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08254735993550988, |
|
"eval_loss": 2.046201705932617, |
|
"eval_runtime": 241.0686, |
|
"eval_samples_per_second": 1.016, |
|
"eval_steps_per_second": 0.51, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.09286577992744861, |
|
"grad_norm": 27.64794158935547, |
|
"learning_rate": 9.865931440351337e-05, |
|
"loss": 2.0807, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.10318419991938735, |
|
"grad_norm": 20.194869995117188, |
|
"learning_rate": 9.825130765281668e-05, |
|
"loss": 2.0779, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10318419991938735, |
|
"eval_loss": 2.0509941577911377, |
|
"eval_runtime": 141.7947, |
|
"eval_samples_per_second": 1.728, |
|
"eval_steps_per_second": 0.867, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11350261991132608, |
|
"grad_norm": 18.115293502807617, |
|
"learning_rate": 9.779026003589304e-05, |
|
"loss": 2.065, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.12382103990326482, |
|
"grad_norm": 14.515559196472168, |
|
"learning_rate": 9.727667836522407e-05, |
|
"loss": 2.0905, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12382103990326482, |
|
"eval_loss": 2.0475986003875732, |
|
"eval_runtime": 128.356, |
|
"eval_samples_per_second": 1.909, |
|
"eval_steps_per_second": 0.958, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.13413945989520354, |
|
"grad_norm": 15.200082778930664, |
|
"learning_rate": 9.6711127202021e-05, |
|
"loss": 2.049, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1444578798871423, |
|
"grad_norm": 14.234698295593262, |
|
"learning_rate": 9.609422823562345e-05, |
|
"loss": 2.0624, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1444578798871423, |
|
"eval_loss": 2.0475313663482666, |
|
"eval_runtime": 123.5017, |
|
"eval_samples_per_second": 1.984, |
|
"eval_steps_per_second": 0.996, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.154776299879081, |
|
"grad_norm": 34.070438385009766, |
|
"learning_rate": 9.542665960009959e-05, |
|
"loss": 2.0794, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16509471987101976, |
|
"grad_norm": 14.070226669311523, |
|
"learning_rate": 9.470915512879852e-05, |
|
"loss": 2.0793, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.16509471987101976, |
|
"eval_loss": 2.042788505554199, |
|
"eval_runtime": 124.8794, |
|
"eval_samples_per_second": 1.962, |
|
"eval_steps_per_second": 0.985, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17541313986295848, |
|
"grad_norm": 13.649319648742676, |
|
"learning_rate": 9.394250354767467e-05, |
|
"loss": 2.0942, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18573155985489723, |
|
"grad_norm": 14.56338882446289, |
|
"learning_rate": 9.312754760827061e-05, |
|
"loss": 2.0559, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.18573155985489723, |
|
"eval_loss": 2.052072763442993, |
|
"eval_runtime": 252.4609, |
|
"eval_samples_per_second": 0.97, |
|
"eval_steps_per_second": 0.487, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.19604997984683595, |
|
"grad_norm": 16.887882232666016, |
|
"learning_rate": 9.226518316131176e-05, |
|
"loss": 2.0926, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.2063683998387747, |
|
"grad_norm": 16.470928192138672, |
|
"learning_rate": 9.1356358171931e-05, |
|
"loss": 2.0597, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2063683998387747, |
|
"eval_loss": 2.068044424057007, |
|
"eval_runtime": 120.7858, |
|
"eval_samples_per_second": 2.028, |
|
"eval_steps_per_second": 1.018, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.21668681983071342, |
|
"grad_norm": 16.528242111206055, |
|
"learning_rate": 9.040207167760586e-05, |
|
"loss": 2.0819, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.22700523982265217, |
|
"grad_norm": 18.425939559936523, |
|
"learning_rate": 8.940337268995385e-05, |
|
"loss": 2.1235, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.22700523982265217, |
|
"eval_loss": 2.0848894119262695, |
|
"eval_runtime": 117.9661, |
|
"eval_samples_per_second": 2.077, |
|
"eval_steps_per_second": 1.043, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2373236598145909, |
|
"grad_norm": 11.150261878967285, |
|
"learning_rate": 8.836135904159302e-05, |
|
"loss": 2.1182, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.24764207980652964, |
|
"grad_norm": 13.942819595336914, |
|
"learning_rate": 8.727717617933544e-05, |
|
"loss": 2.14, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.24764207980652964, |
|
"eval_loss": 2.0771827697753906, |
|
"eval_runtime": 147.6689, |
|
"eval_samples_per_second": 1.659, |
|
"eval_steps_per_second": 0.833, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.25796049979846836, |
|
"grad_norm": 21.486312866210938, |
|
"learning_rate": 8.615201590504017e-05, |
|
"loss": 2.1272, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2682789197904071, |
|
"grad_norm": 13.030159950256348, |
|
"learning_rate": 8.498711506550983e-05, |
|
"loss": 2.1586, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2682789197904071, |
|
"eval_loss": 2.0880281925201416, |
|
"eval_runtime": 155.8186, |
|
"eval_samples_per_second": 1.572, |
|
"eval_steps_per_second": 0.789, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.27859733978234585, |
|
"grad_norm": 18.889177322387695, |
|
"learning_rate": 8.378375419287099e-05, |
|
"loss": 2.0883, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2889157597742846, |
|
"grad_norm": 16.2430419921875, |
|
"learning_rate": 8.25432560969328e-05, |
|
"loss": 2.0974, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2889157597742846, |
|
"eval_loss": 2.0836544036865234, |
|
"eval_runtime": 249.1613, |
|
"eval_samples_per_second": 0.983, |
|
"eval_steps_per_second": 0.494, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2992341797662233, |
|
"grad_norm": 16.362117767333984, |
|
"learning_rate": 8.126698441107146e-05, |
|
"loss": 2.1367, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.309552599758162, |
|
"grad_norm": 16.559032440185547, |
|
"learning_rate": 7.995634209323886e-05, |
|
"loss": 2.1577, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.309552599758162, |
|
"eval_loss": 2.0837860107421875, |
|
"eval_runtime": 142.7033, |
|
"eval_samples_per_second": 1.717, |
|
"eval_steps_per_second": 0.862, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3198710197501008, |
|
"grad_norm": 16.882129669189453, |
|
"learning_rate": 7.861276988374302e-05, |
|
"loss": 2.1391, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3301894397420395, |
|
"grad_norm": 16.510204315185547, |
|
"learning_rate": 7.723774472149601e-05, |
|
"loss": 2.0998, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3301894397420395, |
|
"eval_loss": 2.089890718460083, |
|
"eval_runtime": 134.7898, |
|
"eval_samples_per_second": 1.818, |
|
"eval_steps_per_second": 0.913, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.34050785973397824, |
|
"grad_norm": 15.429149627685547, |
|
"learning_rate": 7.583277812046993e-05, |
|
"loss": 2.0801, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.35082627972591696, |
|
"grad_norm": 16.833860397338867, |
|
"learning_rate": 7.439941450814591e-05, |
|
"loss": 2.1069, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.35082627972591696, |
|
"eval_loss": 2.088224172592163, |
|
"eval_runtime": 119.5762, |
|
"eval_samples_per_second": 2.049, |
|
"eval_steps_per_second": 1.029, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3611446997178557, |
|
"grad_norm": 15.170039176940918, |
|
"learning_rate": 7.293922952778239e-05, |
|
"loss": 2.1277, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.37146311970979445, |
|
"grad_norm": 16.08381462097168, |
|
"learning_rate": 7.145382830636924e-05, |
|
"loss": 2.1621, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.37146311970979445, |
|
"eval_loss": 2.084606647491455, |
|
"eval_runtime": 51.8689, |
|
"eval_samples_per_second": 4.723, |
|
"eval_steps_per_second": 2.371, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3817815397017332, |
|
"grad_norm": 13.434906005859375, |
|
"learning_rate": 6.994484369017143e-05, |
|
"loss": 2.124, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3920999596936719, |
|
"grad_norm": 16.64518165588379, |
|
"learning_rate": 6.841393444980177e-05, |
|
"loss": 2.1441, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3920999596936719, |
|
"eval_loss": 2.0949418544769287, |
|
"eval_runtime": 54.9018, |
|
"eval_samples_per_second": 4.463, |
|
"eval_steps_per_second": 2.24, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.4024183796856106, |
|
"grad_norm": 14.433173179626465, |
|
"learning_rate": 6.686278345679625e-05, |
|
"loss": 2.1492, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.4127367996775494, |
|
"grad_norm": 21.335298538208008, |
|
"learning_rate": 6.529309583369605e-05, |
|
"loss": 2.1355, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4127367996775494, |
|
"eval_loss": 2.085907220840454, |
|
"eval_runtime": 31.9309, |
|
"eval_samples_per_second": 7.673, |
|
"eval_steps_per_second": 3.852, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4230552196694881, |
|
"grad_norm": 12.250011444091797, |
|
"learning_rate": 6.370659707966967e-05, |
|
"loss": 2.1237, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.43337363966142683, |
|
"grad_norm": 16.425058364868164, |
|
"learning_rate": 6.2105031173736e-05, |
|
"loss": 2.084, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.43337363966142683, |
|
"eval_loss": 2.087137460708618, |
|
"eval_runtime": 33.4112, |
|
"eval_samples_per_second": 7.333, |
|
"eval_steps_per_second": 3.681, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.44369205965336556, |
|
"grad_norm": 13.831182479858398, |
|
"learning_rate": 6.049015865767318e-05, |
|
"loss": 2.138, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.45401047964530433, |
|
"grad_norm": 17.53313446044922, |
|
"learning_rate": 5.88637547007204e-05, |
|
"loss": 2.1649, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.45401047964530433, |
|
"eval_loss": 2.084533214569092, |
|
"eval_runtime": 32.4375, |
|
"eval_samples_per_second": 7.553, |
|
"eval_steps_per_second": 3.792, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.46432889963724305, |
|
"grad_norm": 14.465840339660645, |
|
"learning_rate": 5.722760714820057e-05, |
|
"loss": 2.1365, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4746473196291818, |
|
"grad_norm": 14.579000473022461, |
|
"learning_rate": 5.5583514556208514e-05, |
|
"loss": 2.0651, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4746473196291818, |
|
"eval_loss": 2.071864604949951, |
|
"eval_runtime": 51.0877, |
|
"eval_samples_per_second": 4.796, |
|
"eval_steps_per_second": 2.408, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4849657396211205, |
|
"grad_norm": 15.363332748413086, |
|
"learning_rate": 5.393328421452514e-05, |
|
"loss": 2.1009, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.49528415961305927, |
|
"grad_norm": 13.905656814575195, |
|
"learning_rate": 5.2278730159931076e-05, |
|
"loss": 2.1708, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.49528415961305927, |
|
"eval_loss": 2.072157144546509, |
|
"eval_runtime": 32.8055, |
|
"eval_samples_per_second": 7.468, |
|
"eval_steps_per_second": 3.749, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.505602579604998, |
|
"grad_norm": 13.91466236114502, |
|
"learning_rate": 5.062167118210367e-05, |
|
"loss": 2.1383, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5159209995969367, |
|
"grad_norm": 15.01607608795166, |
|
"learning_rate": 4.896392882428901e-05, |
|
"loss": 2.1311, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5159209995969367, |
|
"eval_loss": 2.0677082538604736, |
|
"eval_runtime": 33.5658, |
|
"eval_samples_per_second": 7.299, |
|
"eval_steps_per_second": 3.664, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5262394195888754, |
|
"grad_norm": 10.953645706176758, |
|
"learning_rate": 4.730732538094749e-05, |
|
"loss": 2.1166, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5365578395808142, |
|
"grad_norm": 15.371764183044434, |
|
"learning_rate": 4.565368189457313e-05, |
|
"loss": 2.1038, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5365578395808142, |
|
"eval_loss": 2.0626721382141113, |
|
"eval_runtime": 32.7774, |
|
"eval_samples_per_second": 7.475, |
|
"eval_steps_per_second": 3.753, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5468762595727529, |
|
"grad_norm": 14.383162498474121, |
|
"learning_rate": 4.400481615388948e-05, |
|
"loss": 2.1023, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5571946795646917, |
|
"grad_norm": 15.261186599731445, |
|
"learning_rate": 4.236254069562213e-05, |
|
"loss": 2.0804, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5571946795646917, |
|
"eval_loss": 2.075681686401367, |
|
"eval_runtime": 32.6725, |
|
"eval_samples_per_second": 7.499, |
|
"eval_steps_per_second": 3.765, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5675130995566304, |
|
"grad_norm": 10.065788269042969, |
|
"learning_rate": 4.0728660812044536e-05, |
|
"loss": 2.0796, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5778315195485692, |
|
"grad_norm": 13.413424491882324, |
|
"learning_rate": 3.910497256648742e-05, |
|
"loss": 2.0695, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5778315195485692, |
|
"eval_loss": 2.064877986907959, |
|
"eval_runtime": 53.1644, |
|
"eval_samples_per_second": 4.608, |
|
"eval_steps_per_second": 2.314, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5881499395405079, |
|
"grad_norm": 11.468412399291992, |
|
"learning_rate": 3.749326081899329e-05, |
|
"loss": 2.0694, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5984683595324466, |
|
"grad_norm": 13.881438255310059, |
|
"learning_rate": 3.589529726428615e-05, |
|
"loss": 2.0961, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5984683595324466, |
|
"eval_loss": 2.064319610595703, |
|
"eval_runtime": 31.9203, |
|
"eval_samples_per_second": 7.675, |
|
"eval_steps_per_second": 3.853, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6087867795243853, |
|
"grad_norm": 10.702425956726074, |
|
"learning_rate": 3.431283848421347e-05, |
|
"loss": 2.0564, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.619105199516324, |
|
"grad_norm": 11.869012832641602, |
|
"learning_rate": 3.274762401680124e-05, |
|
"loss": 2.0808, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.619105199516324, |
|
"eval_loss": 2.056699752807617, |
|
"eval_runtime": 30.4915, |
|
"eval_samples_per_second": 8.035, |
|
"eval_steps_per_second": 4.034, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6294236195082628, |
|
"grad_norm": 9.28243637084961, |
|
"learning_rate": 3.120137444404442e-05, |
|
"loss": 2.1001, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6397420395002016, |
|
"grad_norm": 10.947897911071777, |
|
"learning_rate": 2.9675789500535328e-05, |
|
"loss": 2.1337, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6397420395002016, |
|
"eval_loss": 2.0556507110595703, |
|
"eval_runtime": 31.1947, |
|
"eval_samples_per_second": 7.854, |
|
"eval_steps_per_second": 3.943, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6500604594921403, |
|
"grad_norm": 11.127779006958008, |
|
"learning_rate": 2.8172546205008683e-05, |
|
"loss": 2.0562, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.660378879484079, |
|
"grad_norm": 10.932923316955566, |
|
"learning_rate": 2.6693297016857188e-05, |
|
"loss": 2.0565, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.660378879484079, |
|
"eval_loss": 2.055548906326294, |
|
"eval_runtime": 31.9066, |
|
"eval_samples_per_second": 7.679, |
|
"eval_steps_per_second": 3.855, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6706972994760177, |
|
"grad_norm": 11.094362258911133, |
|
"learning_rate": 2.523966801964468e-05, |
|
"loss": 2.0685, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6810157194679565, |
|
"grad_norm": 10.753888130187988, |
|
"learning_rate": 2.3813257133612827e-05, |
|
"loss": 2.1184, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6810157194679565, |
|
"eval_loss": 2.0496644973754883, |
|
"eval_runtime": 54.0908, |
|
"eval_samples_per_second": 4.529, |
|
"eval_steps_per_second": 2.274, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6913341394598952, |
|
"grad_norm": 9.6819429397583, |
|
"learning_rate": 2.2415632359146856e-05, |
|
"loss": 2.0437, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7016525594518339, |
|
"grad_norm": 10.608392715454102, |
|
"learning_rate": 2.104833005313131e-05, |
|
"loss": 2.0604, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7016525594518339, |
|
"eval_loss": 2.041249990463257, |
|
"eval_runtime": 34.03, |
|
"eval_samples_per_second": 7.2, |
|
"eval_steps_per_second": 3.614, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7119709794437726, |
|
"grad_norm": 11.31783390045166, |
|
"learning_rate": 1.971285324008994e-05, |
|
"loss": 2.093, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.7222893994357114, |
|
"grad_norm": 11.558327674865723, |
|
"learning_rate": 1.84106699599668e-05, |
|
"loss": 2.1099, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7222893994357114, |
|
"eval_loss": 2.038382053375244, |
|
"eval_runtime": 30.4289, |
|
"eval_samples_per_second": 8.052, |
|
"eval_steps_per_second": 4.042, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7326078194276502, |
|
"grad_norm": 9.70297908782959, |
|
"learning_rate": 1.7143211654364762e-05, |
|
"loss": 2.108, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7429262394195889, |
|
"grad_norm": 12.303277969360352, |
|
"learning_rate": 1.5911871593014837e-05, |
|
"loss": 2.1048, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7429262394195889, |
|
"eval_loss": 2.041520118713379, |
|
"eval_runtime": 32.0147, |
|
"eval_samples_per_second": 7.653, |
|
"eval_steps_per_second": 3.842, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7532446594115276, |
|
"grad_norm": 11.557732582092285, |
|
"learning_rate": 1.4718003342206722e-05, |
|
"loss": 2.1051, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7635630794034663, |
|
"grad_norm": 11.619261741638184, |
|
"learning_rate": 1.3562919276863844e-05, |
|
"loss": 2.0692, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7635630794034663, |
|
"eval_loss": 2.033982992172241, |
|
"eval_runtime": 32.7706, |
|
"eval_samples_per_second": 7.476, |
|
"eval_steps_per_second": 3.753, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7738814993954051, |
|
"grad_norm": 10.316109657287598, |
|
"learning_rate": 1.2447889137898293e-05, |
|
"loss": 2.0314, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7841999193873438, |
|
"grad_norm": 12.697169303894043, |
|
"learning_rate": 1.1374138636432053e-05, |
|
"loss": 2.0489, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7841999193873438, |
|
"eval_loss": 2.033088445663452, |
|
"eval_runtime": 50.91, |
|
"eval_samples_per_second": 4.812, |
|
"eval_steps_per_second": 2.416, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7945183393792825, |
|
"grad_norm": 11.112667083740234, |
|
"learning_rate": 1.0342848106418368e-05, |
|
"loss": 2.0759, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8048367593712212, |
|
"grad_norm": 11.560709953308105, |
|
"learning_rate": 9.35515120714447e-06, |
|
"loss": 2.057, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8048367593712212, |
|
"eval_loss": 2.027519702911377, |
|
"eval_runtime": 32.5351, |
|
"eval_samples_per_second": 7.53, |
|
"eval_steps_per_second": 3.781, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8151551793631601, |
|
"grad_norm": 10.368136405944824, |
|
"learning_rate": 8.41213367704224e-06, |
|
"loss": 2.0332, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8254735993550988, |
|
"grad_norm": 9.943156242370605, |
|
"learning_rate": 7.51483214017637e-06, |
|
"loss": 2.0485, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8254735993550988, |
|
"eval_loss": 2.022447347640991, |
|
"eval_runtime": 32.7692, |
|
"eval_samples_per_second": 7.477, |
|
"eval_steps_per_second": 3.754, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8357920193470375, |
|
"grad_norm": 9.394889831542969, |
|
"learning_rate": 6.664232966721995e-06, |
|
"loss": 2.1139, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8461104393389762, |
|
"grad_norm": 8.868988037109375, |
|
"learning_rate": 5.8612711886848196e-06, |
|
"loss": 2.0364, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8461104393389762, |
|
"eval_loss": 2.0201539993286133, |
|
"eval_runtime": 32.5539, |
|
"eval_samples_per_second": 7.526, |
|
"eval_steps_per_second": 3.778, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.856428859330915, |
|
"grad_norm": 10.170036315917969, |
|
"learning_rate": 5.106829472055202e-06, |
|
"loss": 2.0615, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8667472793228537, |
|
"grad_norm": 9.667701721191406, |
|
"learning_rate": 4.401737146526219e-06, |
|
"loss": 2.014, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8667472793228537, |
|
"eval_loss": 2.02396821975708, |
|
"eval_runtime": 51.0734, |
|
"eval_samples_per_second": 4.797, |
|
"eval_steps_per_second": 2.408, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8770656993147924, |
|
"grad_norm": 10.14772891998291, |
|
"learning_rate": 3.7467692938425057e-06, |
|
"loss": 2.0966, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8873841193067311, |
|
"grad_norm": 10.002238273620605, |
|
"learning_rate": 3.142645895781715e-06, |
|
"loss": 2.0656, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8873841193067311, |
|
"eval_loss": 2.0236427783966064, |
|
"eval_runtime": 31.6007, |
|
"eval_samples_per_second": 7.753, |
|
"eval_steps_per_second": 3.892, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8977025392986699, |
|
"grad_norm": 8.875214576721191, |
|
"learning_rate": 2.5900310427053044e-06, |
|
"loss": 2.0104, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.9080209592906087, |
|
"grad_norm": 9.890630722045898, |
|
"learning_rate": 2.089532203548794e-06, |
|
"loss": 2.0473, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9080209592906087, |
|
"eval_loss": 2.0197205543518066, |
|
"eval_runtime": 33.634, |
|
"eval_samples_per_second": 7.284, |
|
"eval_steps_per_second": 3.657, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9183393792825474, |
|
"grad_norm": 10.363245964050293, |
|
"learning_rate": 1.6416995580537664e-06, |
|
"loss": 2.0972, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.9286577992744861, |
|
"grad_norm": 9.002208709716797, |
|
"learning_rate": 1.247025391975698e-06, |
|
"loss": 2.0279, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9286577992744861, |
|
"eval_loss": 2.0180423259735107, |
|
"eval_runtime": 33.0195, |
|
"eval_samples_per_second": 7.42, |
|
"eval_steps_per_second": 3.725, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9389762192664248, |
|
"grad_norm": 9.901611328125, |
|
"learning_rate": 9.059435559326257e-07, |
|
"loss": 2.0544, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9492946392583635, |
|
"grad_norm": 9.303104400634766, |
|
"learning_rate": 6.188289884893062e-07, |
|
"loss": 2.0415, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9492946392583635, |
|
"eval_loss": 2.017828941345215, |
|
"eval_runtime": 31.5389, |
|
"eval_samples_per_second": 7.768, |
|
"eval_steps_per_second": 3.9, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9596130592503023, |
|
"grad_norm": 9.742131233215332, |
|
"learning_rate": 3.8599730400115107e-07, |
|
"loss": 2.0714, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.969931479242241, |
|
"grad_norm": 11.851016998291016, |
|
"learning_rate": 2.0770444567118075e-07, |
|
"loss": 2.0419, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.969931479242241, |
|
"eval_loss": 2.017824411392212, |
|
"eval_runtime": 51.264, |
|
"eval_samples_per_second": 4.779, |
|
"eval_steps_per_second": 2.399, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9802498992341797, |
|
"grad_norm": 12.985248565673828, |
|
"learning_rate": 8.414640420116305e-08, |
|
"loss": 2.0687, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9905683192261185, |
|
"grad_norm": 9.977989196777344, |
|
"learning_rate": 1.5459002346324135e-08, |
|
"loss": 2.0597, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9905683192261185, |
|
"eval_loss": 2.0180203914642334, |
|
"eval_runtime": 33.9596, |
|
"eval_samples_per_second": 7.214, |
|
"eval_steps_per_second": 3.622, |
|
"step": 384 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 387, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 8, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.875183525374198e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|