|
{
  "best_metric": 2.015996217727661,
  "best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_pct_reverse_r16/checkpoint-368",
  "epoch": 0.9905683192261185,
  "eval_steps": 8,
  "global_step": 384,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025796049979846837,
      "grad_norm": 16.794553756713867,
      "learning_rate": 1.25e-05,
      "loss": 2.1524,
      "step": 1
    },
    {
      "epoch": 0.010318419991938735,
      "grad_norm": 10.606828689575195,
      "learning_rate": 5e-05,
      "loss": 2.046,
      "step": 4
    },
    {
      "epoch": 0.02063683998387747,
      "grad_norm": 10.568798065185547,
      "learning_rate": 0.0001,
      "loss": 1.9648,
      "step": 8
    },
    {
      "epoch": 0.02063683998387747,
      "eval_loss": 2.0392367839813232,
      "eval_runtime": 46.0067,
      "eval_samples_per_second": 5.325,
      "eval_steps_per_second": 2.674,
      "step": 8
    },
    {
      "epoch": 0.030955259975816204,
      "grad_norm": 29.913541793823242,
      "learning_rate": 9.997251843068762e-05,
      "loss": 2.0181,
      "step": 12
    },
    {
      "epoch": 0.04127367996775494,
      "grad_norm": 20.060449600219727,
      "learning_rate": 9.989010393221656e-05,
      "loss": 2.0599,
      "step": 16
    },
    {
      "epoch": 0.04127367996775494,
      "eval_loss": 2.0531184673309326,
      "eval_runtime": 45.5707,
      "eval_samples_per_second": 5.376,
      "eval_steps_per_second": 2.699,
      "step": 16
    },
    {
      "epoch": 0.051592099959693674,
      "grad_norm": 42.19817352294922,
      "learning_rate": 9.97528470997769e-05,
      "loss": 2.1649,
      "step": 20
    },
    {
      "epoch": 0.06191051995163241,
      "grad_norm": 53.00956344604492,
      "learning_rate": 9.956089881469482e-05,
      "loss": 2.1274,
      "step": 24
    },
    {
      "epoch": 0.06191051995163241,
      "eval_loss": 2.057147741317749,
      "eval_runtime": 46.5266,
      "eval_samples_per_second": 5.266,
      "eval_steps_per_second": 2.644,
      "step": 24
    },
    {
      "epoch": 0.07222893994357114,
      "grad_norm": 36.37834548950195,
      "learning_rate": 9.931447007857432e-05,
      "loss": 2.084,
      "step": 28
    },
    {
      "epoch": 0.08254735993550988,
      "grad_norm": 26.134798049926758,
      "learning_rate": 9.901383178135113e-05,
      "loss": 2.0718,
      "step": 32
    },
    {
      "epoch": 0.08254735993550988,
      "eval_loss": 2.0472965240478516,
      "eval_runtime": 80.6576,
      "eval_samples_per_second": 3.038,
      "eval_steps_per_second": 1.525,
      "step": 32
    },
    {
      "epoch": 0.09286577992744861,
      "grad_norm": 15.81359577178955,
      "learning_rate": 9.865931440351337e-05,
      "loss": 2.0784,
      "step": 36
    },
    {
      "epoch": 0.10318419991938735,
      "grad_norm": 20.870193481445312,
      "learning_rate": 9.825130765281668e-05,
      "loss": 2.0646,
      "step": 40
    },
    {
      "epoch": 0.10318419991938735,
      "eval_loss": 2.041971445083618,
      "eval_runtime": 46.7629,
      "eval_samples_per_second": 5.239,
      "eval_steps_per_second": 2.63,
      "step": 40
    },
    {
      "epoch": 0.11350261991132608,
      "grad_norm": 18.623149871826172,
      "learning_rate": 9.779026003589304e-05,
      "loss": 2.0528,
      "step": 44
    },
    {
      "epoch": 0.12382103990326482,
      "grad_norm": 13.586410522460938,
      "learning_rate": 9.727667836522407e-05,
      "loss": 2.0883,
      "step": 48
    },
    {
      "epoch": 0.12382103990326482,
      "eval_loss": 2.046041488647461,
      "eval_runtime": 47.9414,
      "eval_samples_per_second": 5.11,
      "eval_steps_per_second": 2.566,
      "step": 48
    },
    {
      "epoch": 0.13413945989520354,
      "grad_norm": 14.266491889953613,
      "learning_rate": 9.6711127202021e-05,
      "loss": 2.0503,
      "step": 52
    },
    {
      "epoch": 0.1444578798871423,
      "grad_norm": 16.555265426635742,
      "learning_rate": 9.609422823562345e-05,
      "loss": 2.0611,
      "step": 56
    },
    {
      "epoch": 0.1444578798871423,
      "eval_loss": 2.0496585369110107,
      "eval_runtime": 46.5878,
      "eval_samples_per_second": 5.259,
      "eval_steps_per_second": 2.64,
      "step": 56
    },
    {
      "epoch": 0.154776299879081,
      "grad_norm": 16.17994499206543,
      "learning_rate": 9.542665960009959e-05,
      "loss": 2.082,
      "step": 60
    },
    {
      "epoch": 0.16509471987101976,
      "grad_norm": 16.075532913208008,
      "learning_rate": 9.470915512879852e-05,
      "loss": 2.0841,
      "step": 64
    },
    {
      "epoch": 0.16509471987101976,
      "eval_loss": 2.053602933883667,
      "eval_runtime": 147.0784,
      "eval_samples_per_second": 1.666,
      "eval_steps_per_second": 0.836,
      "step": 64
    },
    {
      "epoch": 0.17541313986295848,
      "grad_norm": 20.747583389282227,
      "learning_rate": 9.394250354767467e-05,
      "loss": 2.1047,
      "step": 68
    },
    {
      "epoch": 0.18573155985489723,
      "grad_norm": 20.321056365966797,
      "learning_rate": 9.312754760827061e-05,
      "loss": 2.0695,
      "step": 72
    },
    {
      "epoch": 0.18573155985489723,
      "eval_loss": 2.0687732696533203,
      "eval_runtime": 276.898,
      "eval_samples_per_second": 0.885,
      "eval_steps_per_second": 0.444,
      "step": 72
    },
    {
      "epoch": 0.19604997984683595,
      "grad_norm": 21.839866638183594,
      "learning_rate": 9.226518316131176e-05,
      "loss": 2.1056,
      "step": 76
    },
    {
      "epoch": 0.2063683998387747,
      "grad_norm": 16.164918899536133,
      "learning_rate": 9.1356358171931e-05,
      "loss": 2.0696,
      "step": 80
    },
    {
      "epoch": 0.2063683998387747,
      "eval_loss": 2.079158067703247,
      "eval_runtime": 138.6965,
      "eval_samples_per_second": 1.766,
      "eval_steps_per_second": 0.887,
      "step": 80
    },
    {
      "epoch": 0.21668681983071342,
      "grad_norm": 16.828935623168945,
      "learning_rate": 9.040207167760586e-05,
      "loss": 2.0932,
      "step": 84
    },
    {
      "epoch": 0.22700523982265217,
      "grad_norm": 24.9482479095459,
      "learning_rate": 8.940337268995385e-05,
      "loss": 2.1315,
      "step": 88
    },
    {
      "epoch": 0.22700523982265217,
      "eval_loss": 2.089974880218506,
      "eval_runtime": 145.0596,
      "eval_samples_per_second": 1.689,
      "eval_steps_per_second": 0.848,
      "step": 88
    },
    {
      "epoch": 0.2373236598145909,
      "grad_norm": 14.4788179397583,
      "learning_rate": 8.836135904159302e-05,
      "loss": 2.1236,
      "step": 92
    },
    {
      "epoch": 0.24764207980652964,
      "grad_norm": 14.118853569030762,
      "learning_rate": 8.727717617933544e-05,
      "loss": 2.1466,
      "step": 96
    },
    {
      "epoch": 0.24764207980652964,
      "eval_loss": 2.0826759338378906,
      "eval_runtime": 129.8085,
      "eval_samples_per_second": 1.887,
      "eval_steps_per_second": 0.948,
      "step": 96
    },
    {
      "epoch": 0.25796049979846836,
      "grad_norm": 16.814769744873047,
      "learning_rate": 8.615201590504017e-05,
      "loss": 2.1325,
      "step": 100
    },
    {
      "epoch": 0.2682789197904071,
      "grad_norm": 13.403229713439941,
      "learning_rate": 8.498711506550983e-05,
      "loss": 2.1575,
      "step": 104
    },
    {
      "epoch": 0.2682789197904071,
      "eval_loss": 2.082623243331909,
      "eval_runtime": 132.6815,
      "eval_samples_per_second": 1.847,
      "eval_steps_per_second": 0.927,
      "step": 104
    },
    {
      "epoch": 0.27859733978234585,
      "grad_norm": 19.730573654174805,
      "learning_rate": 8.378375419287099e-05,
      "loss": 2.0873,
      "step": 108
    },
    {
      "epoch": 0.2889157597742846,
      "grad_norm": 19.85085678100586,
      "learning_rate": 8.25432560969328e-05,
      "loss": 2.0925,
      "step": 112
    },
    {
      "epoch": 0.2889157597742846,
      "eval_loss": 2.086350440979004,
      "eval_runtime": 271.6641,
      "eval_samples_per_second": 0.902,
      "eval_steps_per_second": 0.453,
      "step": 112
    },
    {
      "epoch": 0.2992341797662233,
      "grad_norm": 14.944377899169922,
      "learning_rate": 8.126698441107146e-05,
      "loss": 2.1405,
      "step": 116
    },
    {
      "epoch": 0.309552599758162,
      "grad_norm": 16.105512619018555,
      "learning_rate": 7.995634209323886e-05,
      "loss": 2.1647,
      "step": 120
    },
    {
      "epoch": 0.309552599758162,
      "eval_loss": 2.0815000534057617,
      "eval_runtime": 133.9294,
      "eval_samples_per_second": 1.829,
      "eval_steps_per_second": 0.918,
      "step": 120
    },
    {
      "epoch": 0.3198710197501008,
      "grad_norm": 17.4757080078125,
      "learning_rate": 7.861276988374302e-05,
      "loss": 2.1381,
      "step": 124
    },
    {
      "epoch": 0.3301894397420395,
      "grad_norm": 14.932345390319824,
      "learning_rate": 7.723774472149601e-05,
      "loss": 2.1018,
      "step": 128
    },
    {
      "epoch": 0.3301894397420395,
      "eval_loss": 2.0882339477539062,
      "eval_runtime": 139.4099,
      "eval_samples_per_second": 1.757,
      "eval_steps_per_second": 0.882,
      "step": 128
    },
    {
      "epoch": 0.34050785973397824,
      "grad_norm": 14.54583740234375,
      "learning_rate": 7.583277812046993e-05,
      "loss": 2.0851,
      "step": 132
    },
    {
      "epoch": 0.35082627972591696,
      "grad_norm": 18.295686721801758,
      "learning_rate": 7.439941450814591e-05,
      "loss": 2.1062,
      "step": 136
    },
    {
      "epoch": 0.35082627972591696,
      "eval_loss": 2.0903666019439697,
      "eval_runtime": 138.7624,
      "eval_samples_per_second": 1.766,
      "eval_steps_per_second": 0.886,
      "step": 136
    },
    {
      "epoch": 0.3611446997178557,
      "grad_norm": 14.11962890625,
      "learning_rate": 7.293922952778239e-05,
      "loss": 2.1298,
      "step": 140
    },
    {
      "epoch": 0.37146311970979445,
      "grad_norm": 13.146790504455566,
      "learning_rate": 7.145382830636924e-05,
      "loss": 2.1596,
      "step": 144
    },
    {
      "epoch": 0.37146311970979445,
      "eval_loss": 2.0847134590148926,
      "eval_runtime": 271.2553,
      "eval_samples_per_second": 0.903,
      "eval_steps_per_second": 0.453,
      "step": 144
    },
    {
      "epoch": 0.3817815397017332,
      "grad_norm": 14.248947143554688,
      "learning_rate": 6.994484369017143e-05,
      "loss": 2.1231,
      "step": 148
    },
    {
      "epoch": 0.3920999596936719,
      "grad_norm": 15.606071472167969,
      "learning_rate": 6.841393444980177e-05,
      "loss": 2.1473,
      "step": 152
    },
    {
      "epoch": 0.3920999596936719,
      "eval_loss": 2.0933380126953125,
      "eval_runtime": 276.868,
      "eval_samples_per_second": 0.885,
      "eval_steps_per_second": 0.444,
      "step": 152
    },
    {
      "epoch": 0.4024183796856106,
      "grad_norm": 14.0585298538208,
      "learning_rate": 6.686278345679625e-05,
      "loss": 2.1495,
      "step": 156
    },
    {
      "epoch": 0.4127367996775494,
      "grad_norm": 17.74847412109375,
      "learning_rate": 6.529309583369605e-05,
      "loss": 2.1388,
      "step": 160
    },
    {
      "epoch": 0.4127367996775494,
      "eval_loss": 2.088761568069458,
      "eval_runtime": 130.7698,
      "eval_samples_per_second": 1.874,
      "eval_steps_per_second": 0.941,
      "step": 160
    },
    {
      "epoch": 0.4230552196694881,
      "grad_norm": 14.887138366699219,
      "learning_rate": 6.370659707966967e-05,
      "loss": 2.1268,
      "step": 164
    },
    {
      "epoch": 0.43337363966142683,
      "grad_norm": 13.487187385559082,
      "learning_rate": 6.2105031173736e-05,
      "loss": 2.093,
      "step": 168
    },
    {
      "epoch": 0.43337363966142683,
      "eval_loss": 2.088651657104492,
      "eval_runtime": 139.173,
      "eval_samples_per_second": 1.76,
      "eval_steps_per_second": 0.884,
      "step": 168
    },
    {
      "epoch": 0.44369205965336556,
      "grad_norm": 12.735310554504395,
      "learning_rate": 6.049015865767318e-05,
      "loss": 2.1416,
      "step": 172
    },
    {
      "epoch": 0.45401047964530433,
      "grad_norm": 15.59807014465332,
      "learning_rate": 5.88637547007204e-05,
      "loss": 2.1704,
      "step": 176
    },
    {
      "epoch": 0.45401047964530433,
      "eval_loss": 2.093348979949951,
      "eval_runtime": 141.3221,
      "eval_samples_per_second": 1.734,
      "eval_steps_per_second": 0.87,
      "step": 176
    },
    {
      "epoch": 0.46432889963724305,
      "grad_norm": 14.380990028381348,
      "learning_rate": 5.722760714820057e-05,
      "loss": 2.1412,
      "step": 180
    },
    {
      "epoch": 0.4746473196291818,
      "grad_norm": 12.070490837097168,
      "learning_rate": 5.5583514556208514e-05,
      "loss": 2.0697,
      "step": 184
    },
    {
      "epoch": 0.4746473196291818,
      "eval_loss": 2.0778870582580566,
      "eval_runtime": 245.7592,
      "eval_samples_per_second": 0.997,
      "eval_steps_per_second": 0.5,
      "step": 184
    },
    {
      "epoch": 0.4849657396211205,
      "grad_norm": 13.802606582641602,
      "learning_rate": 5.393328421452514e-05,
      "loss": 2.1088,
      "step": 188
    },
    {
      "epoch": 0.49528415961305927,
      "grad_norm": 13.323396682739258,
      "learning_rate": 5.2278730159931076e-05,
      "loss": 2.1725,
      "step": 192
    },
    {
      "epoch": 0.49528415961305927,
      "eval_loss": 2.071396589279175,
      "eval_runtime": 139.8299,
      "eval_samples_per_second": 1.752,
      "eval_steps_per_second": 0.88,
      "step": 192
    },
    {
      "epoch": 0.505602579604998,
      "grad_norm": 12.754402160644531,
      "learning_rate": 5.062167118210367e-05,
      "loss": 2.1362,
      "step": 196
    },
    {
      "epoch": 0.5159209995969367,
      "grad_norm": 16.00323486328125,
      "learning_rate": 4.896392882428901e-05,
      "loss": 2.1339,
      "step": 200
    },
    {
      "epoch": 0.5159209995969367,
      "eval_loss": 2.069511890411377,
      "eval_runtime": 139.0327,
      "eval_samples_per_second": 1.762,
      "eval_steps_per_second": 0.885,
      "step": 200
    },
    {
      "epoch": 0.5262394195888754,
      "grad_norm": 13.225739479064941,
      "learning_rate": 4.730732538094749e-05,
      "loss": 2.1241,
      "step": 204
    },
    {
      "epoch": 0.5365578395808142,
      "grad_norm": 14.486602783203125,
      "learning_rate": 4.565368189457313e-05,
      "loss": 2.106,
      "step": 208
    },
    {
      "epoch": 0.5365578395808142,
      "eval_loss": 2.063960075378418,
      "eval_runtime": 141.0631,
      "eval_samples_per_second": 1.737,
      "eval_steps_per_second": 0.872,
      "step": 208
    },
    {
      "epoch": 0.5468762595727529,
      "grad_norm": 12.7869234085083,
      "learning_rate": 4.400481615388948e-05,
      "loss": 2.1111,
      "step": 212
    },
    {
      "epoch": 0.5571946795646917,
      "grad_norm": 13.145895957946777,
      "learning_rate": 4.236254069562213e-05,
      "loss": 2.0857,
      "step": 216
    },
    {
      "epoch": 0.5571946795646917,
      "eval_loss": 2.0791518688201904,
      "eval_runtime": 143.4311,
      "eval_samples_per_second": 1.708,
      "eval_steps_per_second": 0.858,
      "step": 216
    },
    {
      "epoch": 0.5675130995566304,
      "grad_norm": 12.685562133789062,
      "learning_rate": 4.0728660812044536e-05,
      "loss": 2.0832,
      "step": 220
    },
    {
      "epoch": 0.5778315195485692,
      "grad_norm": 13.919960975646973,
      "learning_rate": 3.910497256648742e-05,
      "loss": 2.0751,
      "step": 224
    },
    {
      "epoch": 0.5778315195485692,
      "eval_loss": 2.0658464431762695,
      "eval_runtime": 269.6696,
      "eval_samples_per_second": 0.909,
      "eval_steps_per_second": 0.456,
      "step": 224
    },
    {
      "epoch": 0.5881499395405079,
      "grad_norm": 13.588690757751465,
      "learning_rate": 3.749326081899329e-05,
      "loss": 2.0779,
      "step": 228
    },
    {
      "epoch": 0.5984683595324466,
      "grad_norm": 12.779603004455566,
      "learning_rate": 3.589529726428615e-05,
      "loss": 2.0987,
      "step": 232
    },
    {
      "epoch": 0.5984683595324466,
      "eval_loss": 2.0659260749816895,
      "eval_runtime": 142.9536,
      "eval_samples_per_second": 1.714,
      "eval_steps_per_second": 0.86,
      "step": 232
    },
    {
      "epoch": 0.6087867795243853,
      "grad_norm": 14.055486679077148,
      "learning_rate": 3.431283848421347e-05,
      "loss": 2.0618,
      "step": 236
    },
    {
      "epoch": 0.619105199516324,
      "grad_norm": 12.830402374267578,
      "learning_rate": 3.274762401680124e-05,
      "loss": 2.0817,
      "step": 240
    },
    {
      "epoch": 0.619105199516324,
      "eval_loss": 2.0628087520599365,
      "eval_runtime": 144.7768,
      "eval_samples_per_second": 1.692,
      "eval_steps_per_second": 0.85,
      "step": 240
    },
    {
      "epoch": 0.6294236195082628,
      "grad_norm": 11.332627296447754,
      "learning_rate": 3.120137444404442e-05,
      "loss": 2.1072,
      "step": 244
    },
    {
      "epoch": 0.6397420395002016,
      "grad_norm": 11.619379997253418,
      "learning_rate": 2.9675789500535328e-05,
      "loss": 2.1341,
      "step": 248
    },
    {
      "epoch": 0.6397420395002016,
      "eval_loss": 2.05637526512146,
      "eval_runtime": 134.9288,
      "eval_samples_per_second": 1.816,
      "eval_steps_per_second": 0.912,
      "step": 248
    },
    {
      "epoch": 0.6500604594921403,
      "grad_norm": 11.539495468139648,
      "learning_rate": 2.8172546205008683e-05,
      "loss": 2.0623,
      "step": 252
    },
    {
      "epoch": 0.660378879484079,
      "grad_norm": 10.27650260925293,
      "learning_rate": 2.6693297016857188e-05,
      "loss": 2.0567,
      "step": 256
    },
    {
      "epoch": 0.660378879484079,
      "eval_loss": 2.051720142364502,
      "eval_runtime": 132.5711,
      "eval_samples_per_second": 1.848,
      "eval_steps_per_second": 0.928,
      "step": 256
    },
    {
      "epoch": 0.6706972994760177,
      "grad_norm": 10.966778755187988,
      "learning_rate": 2.523966801964468e-05,
      "loss": 2.0717,
      "step": 260
    },
    {
      "epoch": 0.6810157194679565,
      "grad_norm": 10.349825859069824,
      "learning_rate": 2.3813257133612827e-05,
      "loss": 2.1246,
      "step": 264
    },
    {
      "epoch": 0.6810157194679565,
      "eval_loss": 2.045745611190796,
      "eval_runtime": 269.5914,
      "eval_samples_per_second": 0.909,
      "eval_steps_per_second": 0.456,
      "step": 264
    },
    {
      "epoch": 0.6913341394598952,
      "grad_norm": 10.632711410522461,
      "learning_rate": 2.2415632359146856e-05,
      "loss": 2.0458,
      "step": 268
    },
    {
      "epoch": 0.7016525594518339,
      "grad_norm": 9.394647598266602,
      "learning_rate": 2.104833005313131e-05,
      "loss": 2.0623,
      "step": 272
    },
    {
      "epoch": 0.7016525594518339,
      "eval_loss": 2.0422868728637695,
      "eval_runtime": 131.0567,
      "eval_samples_per_second": 1.869,
      "eval_steps_per_second": 0.939,
      "step": 272
    },
    {
      "epoch": 0.7119709794437726,
      "grad_norm": 11.318469047546387,
      "learning_rate": 1.971285324008994e-05,
      "loss": 2.0952,
      "step": 276
    },
    {
      "epoch": 0.7222893994357114,
      "grad_norm": 10.162789344787598,
      "learning_rate": 1.84106699599668e-05,
      "loss": 2.1106,
      "step": 280
    },
    {
      "epoch": 0.7222893994357114,
      "eval_loss": 2.0368635654449463,
      "eval_runtime": 125.1131,
      "eval_samples_per_second": 1.958,
      "eval_steps_per_second": 0.983,
      "step": 280
    },
    {
      "epoch": 0.7326078194276502,
      "grad_norm": 11.464627265930176,
      "learning_rate": 1.7143211654364762e-05,
      "loss": 2.112,
      "step": 284
    },
    {
      "epoch": 0.7429262394195889,
      "grad_norm": 12.67012882232666,
      "learning_rate": 1.5911871593014837e-05,
      "loss": 2.1094,
      "step": 288
    },
    {
      "epoch": 0.7429262394195889,
      "eval_loss": 2.0374763011932373,
      "eval_runtime": 135.9513,
      "eval_samples_per_second": 1.802,
      "eval_steps_per_second": 0.905,
      "step": 288
    },
    {
      "epoch": 0.7532446594115276,
      "grad_norm": 11.443336486816406,
      "learning_rate": 1.4718003342206722e-05,
      "loss": 2.1089,
      "step": 292
    },
    {
      "epoch": 0.7635630794034663,
      "grad_norm": 11.790454864501953,
      "learning_rate": 1.3562919276863844e-05,
      "loss": 2.0678,
      "step": 296
    },
    {
      "epoch": 0.7635630794034663,
      "eval_loss": 2.032977819442749,
      "eval_runtime": 145.1862,
      "eval_samples_per_second": 1.687,
      "eval_steps_per_second": 0.847,
      "step": 296
    },
    {
      "epoch": 0.7738814993954051,
      "grad_norm": 11.48520565032959,
      "learning_rate": 1.2447889137898293e-05,
      "loss": 2.0354,
      "step": 300
    },
    {
      "epoch": 0.7841999193873438,
      "grad_norm": 10.889254570007324,
      "learning_rate": 1.1374138636432053e-05,
      "loss": 2.0521,
      "step": 304
    },
    {
      "epoch": 0.7841999193873438,
      "eval_loss": 2.032576560974121,
      "eval_runtime": 266.89,
      "eval_samples_per_second": 0.918,
      "eval_steps_per_second": 0.461,
      "step": 304
    },
    {
      "epoch": 0.7945183393792825,
      "grad_norm": 11.634953498840332,
      "learning_rate": 1.0342848106418368e-05,
      "loss": 2.0765,
      "step": 308
    },
    {
      "epoch": 0.8048367593712212,
      "grad_norm": 10.742921829223633,
      "learning_rate": 9.35515120714447e-06,
      "loss": 2.0594,
      "step": 312
    },
    {
      "epoch": 0.8048367593712212,
      "eval_loss": 2.0241119861602783,
      "eval_runtime": 145.6455,
      "eval_samples_per_second": 1.682,
      "eval_steps_per_second": 0.845,
      "step": 312
    },
    {
      "epoch": 0.8151551793631601,
      "grad_norm": 9.77017593383789,
      "learning_rate": 8.41213367704224e-06,
      "loss": 2.0305,
      "step": 316
    },
    {
      "epoch": 0.8254735993550988,
      "grad_norm": 9.984450340270996,
      "learning_rate": 7.51483214017637e-06,
      "loss": 2.051,
      "step": 320
    },
    {
      "epoch": 0.8254735993550988,
      "eval_loss": 2.0207557678222656,
      "eval_runtime": 141.4793,
      "eval_samples_per_second": 1.732,
      "eval_steps_per_second": 0.869,
      "step": 320
    },
    {
      "epoch": 0.8357920193470375,
      "grad_norm": 10.030397415161133,
      "learning_rate": 6.664232966721995e-06,
      "loss": 2.1179,
      "step": 324
    },
    {
      "epoch": 0.8461104393389762,
      "grad_norm": 9.973139762878418,
      "learning_rate": 5.8612711886848196e-06,
      "loss": 2.0392,
      "step": 328
    },
    {
      "epoch": 0.8461104393389762,
      "eval_loss": 2.020094156265259,
      "eval_runtime": 136.0416,
      "eval_samples_per_second": 1.801,
      "eval_steps_per_second": 0.904,
      "step": 328
    },
    {
      "epoch": 0.856428859330915,
      "grad_norm": 9.772870063781738,
      "learning_rate": 5.106829472055202e-06,
      "loss": 2.0651,
      "step": 332
    },
    {
      "epoch": 0.8667472793228537,
      "grad_norm": 10.798050880432129,
      "learning_rate": 4.401737146526219e-06,
      "loss": 2.0143,
      "step": 336
    },
    {
      "epoch": 0.8667472793228537,
      "eval_loss": 2.0207338333129883,
      "eval_runtime": 254.8265,
      "eval_samples_per_second": 0.961,
      "eval_steps_per_second": 0.483,
      "step": 336
    },
    {
      "epoch": 0.8770656993147924,
      "grad_norm": 9.898650169372559,
      "learning_rate": 3.7467692938425057e-06,
      "loss": 2.0972,
      "step": 340
    },
    {
      "epoch": 0.8873841193067311,
      "grad_norm": 9.767654418945312,
      "learning_rate": 3.142645895781715e-06,
      "loss": 2.0678,
      "step": 344
    },
    {
      "epoch": 0.8873841193067311,
      "eval_loss": 2.0221924781799316,
      "eval_runtime": 135.0488,
      "eval_samples_per_second": 1.814,
      "eval_steps_per_second": 0.911,
      "step": 344
    },
    {
      "epoch": 0.8977025392986699,
      "grad_norm": 8.583464622497559,
      "learning_rate": 2.5900310427053044e-06,
      "loss": 2.0127,
      "step": 348
    },
    {
      "epoch": 0.9080209592906087,
      "grad_norm": 10.855988502502441,
      "learning_rate": 2.089532203548794e-06,
      "loss": 2.0473,
      "step": 352
    },
    {
      "epoch": 0.9080209592906087,
      "eval_loss": 2.0186901092529297,
      "eval_runtime": 141.8755,
      "eval_samples_per_second": 1.727,
      "eval_steps_per_second": 0.867,
      "step": 352
    },
    {
      "epoch": 0.9183393792825474,
      "grad_norm": 10.131417274475098,
      "learning_rate": 1.6416995580537664e-06,
      "loss": 2.0994,
      "step": 356
    },
    {
      "epoch": 0.9286577992744861,
      "grad_norm": 9.968809127807617,
      "learning_rate": 1.247025391975698e-06,
      "loss": 2.0324,
      "step": 360
    },
    {
      "epoch": 0.9286577992744861,
      "eval_loss": 2.016472101211548,
      "eval_runtime": 135.0646,
      "eval_samples_per_second": 1.814,
      "eval_steps_per_second": 0.911,
      "step": 360
    },
    {
      "epoch": 0.9389762192664248,
      "grad_norm": 9.20028018951416,
      "learning_rate": 9.059435559326257e-07,
      "loss": 2.0579,
      "step": 364
    },
    {
      "epoch": 0.9492946392583635,
      "grad_norm": 9.900856971740723,
      "learning_rate": 6.188289884893062e-07,
      "loss": 2.0404,
      "step": 368
    },
    {
      "epoch": 0.9492946392583635,
      "eval_loss": 2.015996217727661,
      "eval_runtime": 135.5315,
      "eval_samples_per_second": 1.808,
      "eval_steps_per_second": 0.908,
      "step": 368
    },
    {
      "epoch": 0.9596130592503023,
      "grad_norm": 10.436941146850586,
      "learning_rate": 3.8599730400115107e-07,
      "loss": 2.0711,
      "step": 372
    },
    {
      "epoch": 0.969931479242241,
      "grad_norm": 10.571781158447266,
      "learning_rate": 2.0770444567118075e-07,
      "loss": 2.0426,
      "step": 376
    },
    {
      "epoch": 0.969931479242241,
      "eval_loss": 2.0162649154663086,
      "eval_runtime": 275.4389,
      "eval_samples_per_second": 0.889,
      "eval_steps_per_second": 0.447,
      "step": 376
    },
    {
      "epoch": 0.9802498992341797,
      "grad_norm": 8.854607582092285,
      "learning_rate": 8.414640420116305e-08,
      "loss": 2.0661,
      "step": 380
    },
    {
      "epoch": 0.9905683192261185,
      "grad_norm": 9.778440475463867,
      "learning_rate": 1.5459002346324135e-08,
      "loss": 2.0635,
      "step": 384
    },
    {
      "epoch": 0.9905683192261185,
      "eval_loss": 2.0161848068237305,
      "eval_runtime": 140.5211,
      "eval_samples_per_second": 1.744,
      "eval_steps_per_second": 0.875,
      "step": 384
    }
  ],
  "logging_steps": 4,
  "max_steps": 387,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 8,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.83492814456488e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|