|
{
  "best_metric": 1.967491626739502,
  "best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_pct_ortho_r32/checkpoint-8",
  "epoch": 0.9981059842836993,
  "eval_steps": 8,
  "global_step": 387,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025790852307072333,
      "grad_norm": 7.501750946044922,
      "learning_rate": 1.25e-05,
      "loss": 2.2239,
      "step": 1
    },
    {
      "epoch": 0.010316340922828933,
      "grad_norm": 4.365421295166016,
      "learning_rate": 5e-05,
      "loss": 2.1072,
      "step": 4
    },
    {
      "epoch": 0.020632681845657867,
      "grad_norm": 3.2742128372192383,
      "learning_rate": 0.0001,
      "loss": 1.9673,
      "step": 8
    },
    {
      "epoch": 0.020632681845657867,
      "eval_loss": 1.967491626739502,
      "eval_runtime": 94.9785,
      "eval_samples_per_second": 2.59,
      "eval_steps_per_second": 2.59,
      "step": 8
    },
    {
      "epoch": 0.0309490227684868,
      "grad_norm": 3.315385580062866,
      "learning_rate": 9.997251843068762e-05,
      "loss": 1.9892,
      "step": 12
    },
    {
      "epoch": 0.04126536369131573,
      "grad_norm": 3.2009124755859375,
      "learning_rate": 9.989010393221656e-05,
      "loss": 1.982,
      "step": 16
    },
    {
      "epoch": 0.04126536369131573,
      "eval_loss": 1.976182460784912,
      "eval_runtime": 122.0257,
      "eval_samples_per_second": 2.016,
      "eval_steps_per_second": 2.016,
      "step": 16
    },
    {
      "epoch": 0.05158170461414467,
      "grad_norm": 2.4668922424316406,
      "learning_rate": 9.97528470997769e-05,
      "loss": 1.931,
      "step": 20
    },
    {
      "epoch": 0.0618980455369736,
      "grad_norm": 2.5818123817443848,
      "learning_rate": 9.956089881469482e-05,
      "loss": 1.9563,
      "step": 24
    },
    {
      "epoch": 0.0618980455369736,
      "eval_loss": 1.9745111465454102,
      "eval_runtime": 82.5917,
      "eval_samples_per_second": 2.979,
      "eval_steps_per_second": 2.979,
      "step": 24
    },
    {
      "epoch": 0.07221438645980253,
      "grad_norm": 2.7342212200164795,
      "learning_rate": 9.931447007857432e-05,
      "loss": 2.003,
      "step": 28
    },
    {
      "epoch": 0.08253072738263147,
      "grad_norm": 2.6568496227264404,
      "learning_rate": 9.901383178135113e-05,
      "loss": 1.957,
      "step": 32
    },
    {
      "epoch": 0.08253072738263147,
      "eval_loss": 1.97785484790802,
      "eval_runtime": 110.4681,
      "eval_samples_per_second": 2.227,
      "eval_steps_per_second": 2.227,
      "step": 32
    },
    {
      "epoch": 0.09284706830546041,
      "grad_norm": 3.633096218109131,
      "learning_rate": 9.865931440351337e-05,
      "loss": 1.9643,
      "step": 36
    },
    {
      "epoch": 0.10316340922828934,
      "grad_norm": 2.6414403915405273,
      "learning_rate": 9.825130765281668e-05,
      "loss": 2.0248,
      "step": 40
    },
    {
      "epoch": 0.10316340922828934,
      "eval_loss": 1.9851678609848022,
      "eval_runtime": 85.4255,
      "eval_samples_per_second": 2.88,
      "eval_steps_per_second": 2.88,
      "step": 40
    },
    {
      "epoch": 0.11347975015111827,
      "grad_norm": 4.06710958480835,
      "learning_rate": 9.779026003589304e-05,
      "loss": 1.9751,
      "step": 44
    },
    {
      "epoch": 0.1237960910739472,
      "grad_norm": 2.688786506652832,
      "learning_rate": 9.727667836522407e-05,
      "loss": 1.9753,
      "step": 48
    },
    {
      "epoch": 0.1237960910739472,
      "eval_loss": 1.9988040924072266,
      "eval_runtime": 83.5761,
      "eval_samples_per_second": 2.943,
      "eval_steps_per_second": 2.943,
      "step": 48
    },
    {
      "epoch": 0.13411243199677614,
      "grad_norm": 2.983027696609497,
      "learning_rate": 9.6711127202021e-05,
      "loss": 2.0274,
      "step": 52
    },
    {
      "epoch": 0.14442877291960507,
      "grad_norm": 2.963914394378662,
      "learning_rate": 9.609422823562345e-05,
      "loss": 1.9752,
      "step": 56
    },
    {
      "epoch": 0.14442877291960507,
      "eval_loss": 1.997394323348999,
      "eval_runtime": 109.8713,
      "eval_samples_per_second": 2.239,
      "eval_steps_per_second": 2.239,
      "step": 56
    },
    {
      "epoch": 0.154745113842434,
      "grad_norm": 2.4576425552368164,
      "learning_rate": 9.542665960009959e-05,
      "loss": 1.9912,
      "step": 60
    },
    {
      "epoch": 0.16506145476526293,
      "grad_norm": 2.9133598804473877,
      "learning_rate": 9.470915512879852e-05,
      "loss": 2.0253,
      "step": 64
    },
    {
      "epoch": 0.16506145476526293,
      "eval_loss": 1.994693398475647,
      "eval_runtime": 85.1875,
      "eval_samples_per_second": 2.888,
      "eval_steps_per_second": 2.888,
      "step": 64
    },
    {
      "epoch": 0.1753777956880919,
      "grad_norm": 2.7807934284210205,
      "learning_rate": 9.394250354767467e-05,
      "loss": 2.0054,
      "step": 68
    },
    {
      "epoch": 0.18569413661092082,
      "grad_norm": 2.75816011428833,
      "learning_rate": 9.312754760827061e-05,
      "loss": 2.0073,
      "step": 72
    },
    {
      "epoch": 0.18569413661092082,
      "eval_loss": 1.9871970415115356,
      "eval_runtime": 116.486,
      "eval_samples_per_second": 2.112,
      "eval_steps_per_second": 2.112,
      "step": 72
    },
    {
      "epoch": 0.19601047753374976,
      "grad_norm": 2.501340389251709,
      "learning_rate": 9.226518316131176e-05,
      "loss": 2.013,
      "step": 76
    },
    {
      "epoch": 0.2063268184565787,
      "grad_norm": 3.0418202877044678,
      "learning_rate": 9.1356358171931e-05,
      "loss": 1.9826,
      "step": 80
    },
    {
      "epoch": 0.2063268184565787,
      "eval_loss": 1.9953396320343018,
      "eval_runtime": 79.6585,
      "eval_samples_per_second": 3.088,
      "eval_steps_per_second": 3.088,
      "step": 80
    },
    {
      "epoch": 0.21664315937940762,
      "grad_norm": 2.696978807449341,
      "learning_rate": 9.040207167760586e-05,
      "loss": 1.9693,
      "step": 84
    },
    {
      "epoch": 0.22695950030223655,
      "grad_norm": 2.604274034500122,
      "learning_rate": 8.940337268995385e-05,
      "loss": 1.9907,
      "step": 88
    },
    {
      "epoch": 0.22695950030223655,
      "eval_loss": 2.0014798641204834,
      "eval_runtime": 119.4751,
      "eval_samples_per_second": 2.059,
      "eval_steps_per_second": 2.059,
      "step": 88
    },
    {
      "epoch": 0.23727584122506548,
      "grad_norm": 2.6495442390441895,
      "learning_rate": 8.836135904159302e-05,
      "loss": 2.0053,
      "step": 92
    },
    {
      "epoch": 0.2475921821478944,
      "grad_norm": 2.7287392616271973,
      "learning_rate": 8.727717617933544e-05,
      "loss": 1.9795,
      "step": 96
    },
    {
      "epoch": 0.2475921821478944,
      "eval_loss": 1.9950815439224243,
      "eval_runtime": 82.9311,
      "eval_samples_per_second": 2.966,
      "eval_steps_per_second": 2.966,
      "step": 96
    },
    {
      "epoch": 0.25790852307072337,
      "grad_norm": 2.6722488403320312,
      "learning_rate": 8.615201590504017e-05,
      "loss": 2.033,
      "step": 100
    },
    {
      "epoch": 0.2682248639935523,
      "grad_norm": 3.8678576946258545,
      "learning_rate": 8.498711506550983e-05,
      "loss": 1.9882,
      "step": 104
    },
    {
      "epoch": 0.2682248639935523,
      "eval_loss": 2.0020015239715576,
      "eval_runtime": 79.5681,
      "eval_samples_per_second": 3.092,
      "eval_steps_per_second": 3.092,
      "step": 104
    },
    {
      "epoch": 0.27854120491638124,
      "grad_norm": 3.1447510719299316,
      "learning_rate": 8.378375419287099e-05,
      "loss": 2.0347,
      "step": 108
    },
    {
      "epoch": 0.28885754583921014,
      "grad_norm": 2.7415413856506348,
      "learning_rate": 8.25432560969328e-05,
      "loss": 1.9896,
      "step": 112
    },
    {
      "epoch": 0.28885754583921014,
      "eval_loss": 1.9962644577026367,
      "eval_runtime": 119.6182,
      "eval_samples_per_second": 2.057,
      "eval_steps_per_second": 2.057,
      "step": 112
    },
    {
      "epoch": 0.2991738867620391,
      "grad_norm": 2.7823774814605713,
      "learning_rate": 8.126698441107146e-05,
      "loss": 1.9733,
      "step": 116
    },
    {
      "epoch": 0.309490227684868,
      "grad_norm": 2.875882387161255,
      "learning_rate": 7.995634209323886e-05,
      "loss": 2.0177,
      "step": 120
    },
    {
      "epoch": 0.309490227684868,
      "eval_loss": 2.0145890712738037,
      "eval_runtime": 82.837,
      "eval_samples_per_second": 2.97,
      "eval_steps_per_second": 2.97,
      "step": 120
    },
    {
      "epoch": 0.31980656860769696,
      "grad_norm": 2.600053548812866,
      "learning_rate": 7.861276988374302e-05,
      "loss": 2.0148,
      "step": 124
    },
    {
      "epoch": 0.33012290953052587,
      "grad_norm": 3.0531198978424072,
      "learning_rate": 7.723774472149601e-05,
      "loss": 2.0131,
      "step": 128
    },
    {
      "epoch": 0.33012290953052587,
      "eval_loss": 2.0012505054473877,
      "eval_runtime": 112.1719,
      "eval_samples_per_second": 2.193,
      "eval_steps_per_second": 2.193,
      "step": 128
    },
    {
      "epoch": 0.3404392504533548,
      "grad_norm": 2.9197065830230713,
      "learning_rate": 7.583277812046993e-05,
      "loss": 2.0205,
      "step": 132
    },
    {
      "epoch": 0.3507555913761838,
      "grad_norm": 2.766697406768799,
      "learning_rate": 7.439941450814591e-05,
      "loss": 2.0384,
      "step": 136
    },
    {
      "epoch": 0.3507555913761838,
      "eval_loss": 2.001709222793579,
      "eval_runtime": 84.1224,
      "eval_samples_per_second": 2.924,
      "eval_steps_per_second": 2.924,
      "step": 136
    },
    {
      "epoch": 0.3610719322990127,
      "grad_norm": 3.0844602584838867,
      "learning_rate": 7.293922952778239e-05,
      "loss": 2.0319,
      "step": 140
    },
    {
      "epoch": 0.37138827322184165,
      "grad_norm": 3.19889760017395,
      "learning_rate": 7.145382830636924e-05,
      "loss": 2.0587,
      "step": 144
    },
    {
      "epoch": 0.37138827322184165,
      "eval_loss": 2.001880168914795,
      "eval_runtime": 114.9668,
      "eval_samples_per_second": 2.14,
      "eval_steps_per_second": 2.14,
      "step": 144
    },
    {
      "epoch": 0.38170461414467055,
      "grad_norm": 3.044001817703247,
      "learning_rate": 6.994484369017143e-05,
      "loss": 2.0052,
      "step": 148
    },
    {
      "epoch": 0.3920209550674995,
      "grad_norm": 3.1708483695983887,
      "learning_rate": 6.841393444980177e-05,
      "loss": 1.9998,
      "step": 152
    },
    {
      "epoch": 0.3920209550674995,
      "eval_loss": 1.996483325958252,
      "eval_runtime": 111.8248,
      "eval_samples_per_second": 2.2,
      "eval_steps_per_second": 2.2,
      "step": 152
    },
    {
      "epoch": 0.4023372959903284,
      "grad_norm": 2.9223573207855225,
      "learning_rate": 6.686278345679625e-05,
      "loss": 1.9855,
      "step": 156
    },
    {
      "epoch": 0.4126536369131574,
      "grad_norm": 2.8330798149108887,
      "learning_rate": 6.529309583369605e-05,
      "loss": 1.9729,
      "step": 160
    },
    {
      "epoch": 0.4126536369131574,
      "eval_loss": 1.9904886484146118,
      "eval_runtime": 85.3147,
      "eval_samples_per_second": 2.883,
      "eval_steps_per_second": 2.883,
      "step": 160
    },
    {
      "epoch": 0.4229699778359863,
      "grad_norm": 2.665085792541504,
      "learning_rate": 6.370659707966967e-05,
      "loss": 1.994,
      "step": 164
    },
    {
      "epoch": 0.43328631875881524,
      "grad_norm": 3.091874122619629,
      "learning_rate": 6.2105031173736e-05,
      "loss": 2.0339,
      "step": 168
    },
    {
      "epoch": 0.43328631875881524,
      "eval_loss": 2.0232627391815186,
      "eval_runtime": 113.1394,
      "eval_samples_per_second": 2.174,
      "eval_steps_per_second": 2.174,
      "step": 168
    },
    {
      "epoch": 0.44360265968164414,
      "grad_norm": 3.6090798377990723,
      "learning_rate": 6.049015865767318e-05,
      "loss": 1.9818,
      "step": 172
    },
    {
      "epoch": 0.4539190006044731,
      "grad_norm": 3.000669479370117,
      "learning_rate": 5.88637547007204e-05,
      "loss": 2.0029,
      "step": 176
    },
    {
      "epoch": 0.4539190006044731,
      "eval_loss": 1.9972038269042969,
      "eval_runtime": 80.5833,
      "eval_samples_per_second": 3.053,
      "eval_steps_per_second": 3.053,
      "step": 176
    },
    {
      "epoch": 0.46423534152730206,
      "grad_norm": 2.6875836849212646,
      "learning_rate": 5.722760714820057e-05,
      "loss": 2.0263,
      "step": 180
    },
    {
      "epoch": 0.47455168245013096,
      "grad_norm": 3.1060383319854736,
      "learning_rate": 5.5583514556208514e-05,
      "loss": 1.997,
      "step": 184
    },
    {
      "epoch": 0.47455168245013096,
      "eval_loss": 1.997556209564209,
      "eval_runtime": 116.9074,
      "eval_samples_per_second": 2.104,
      "eval_steps_per_second": 2.104,
      "step": 184
    },
    {
      "epoch": 0.4848680233729599,
      "grad_norm": 2.5845789909362793,
      "learning_rate": 5.393328421452514e-05,
      "loss": 1.9811,
      "step": 188
    },
    {
      "epoch": 0.4951843642957888,
      "grad_norm": 2.890994071960449,
      "learning_rate": 5.2278730159931076e-05,
      "loss": 1.9808,
      "step": 192
    },
    {
      "epoch": 0.4951843642957888,
      "eval_loss": 2.000748872756958,
      "eval_runtime": 83.3047,
      "eval_samples_per_second": 2.953,
      "eval_steps_per_second": 2.953,
      "step": 192
    },
    {
      "epoch": 0.5055007052186178,
      "grad_norm": 3.2054591178894043,
      "learning_rate": 5.062167118210367e-05,
      "loss": 2.0514,
      "step": 196
    },
    {
      "epoch": 0.5158170461414467,
      "grad_norm": 2.911627769470215,
      "learning_rate": 4.896392882428901e-05,
      "loss": 2.0169,
      "step": 200
    },
    {
      "epoch": 0.5158170461414467,
      "eval_loss": 1.9872047901153564,
      "eval_runtime": 78.3344,
      "eval_samples_per_second": 3.14,
      "eval_steps_per_second": 3.14,
      "step": 200
    },
    {
      "epoch": 0.5261333870642756,
      "grad_norm": 3.094301223754883,
      "learning_rate": 4.730732538094749e-05,
      "loss": 1.9815,
      "step": 204
    },
    {
      "epoch": 0.5364497279871046,
      "grad_norm": 2.6194097995758057,
      "learning_rate": 4.565368189457313e-05,
      "loss": 1.9605,
      "step": 208
    },
    {
      "epoch": 0.5364497279871046,
      "eval_loss": 1.9974828958511353,
      "eval_runtime": 116.5117,
      "eval_samples_per_second": 2.111,
      "eval_steps_per_second": 2.111,
      "step": 208
    },
    {
      "epoch": 0.5467660689099335,
      "grad_norm": 3.0435950756073,
      "learning_rate": 4.400481615388948e-05,
      "loss": 1.9643,
      "step": 212
    },
    {
      "epoch": 0.5570824098327625,
      "grad_norm": 2.7957849502563477,
      "learning_rate": 4.236254069562213e-05,
      "loss": 2.0195,
      "step": 216
    },
    {
      "epoch": 0.5570824098327625,
      "eval_loss": 1.9963051080703735,
      "eval_runtime": 83.9136,
      "eval_samples_per_second": 2.932,
      "eval_steps_per_second": 2.932,
      "step": 216
    },
    {
      "epoch": 0.5673987507555914,
      "grad_norm": 2.5869903564453125,
      "learning_rate": 4.0728660812044536e-05,
      "loss": 2.0336,
      "step": 220
    },
    {
      "epoch": 0.5777150916784203,
      "grad_norm": 2.6142055988311768,
      "learning_rate": 3.910497256648742e-05,
      "loss": 1.9619,
      "step": 224
    },
    {
      "epoch": 0.5777150916784203,
      "eval_loss": 1.9877939224243164,
      "eval_runtime": 113.1041,
      "eval_samples_per_second": 2.175,
      "eval_steps_per_second": 2.175,
      "step": 224
    },
    {
      "epoch": 0.5880314326012492,
      "grad_norm": 2.57497501373291,
      "learning_rate": 3.749326081899329e-05,
      "loss": 1.9923,
      "step": 228
    },
    {
      "epoch": 0.5983477735240782,
      "grad_norm": 2.97292160987854,
      "learning_rate": 3.589529726428615e-05,
      "loss": 1.9361,
      "step": 232
    },
    {
      "epoch": 0.5983477735240782,
      "eval_loss": 2.004483699798584,
      "eval_runtime": 85.1956,
      "eval_samples_per_second": 2.887,
      "eval_steps_per_second": 2.887,
      "step": 232
    },
    {
      "epoch": 0.6086641144469072,
      "grad_norm": 2.6015868186950684,
      "learning_rate": 3.431283848421347e-05,
      "loss": 2.033,
      "step": 236
    },
    {
      "epoch": 0.618980455369736,
      "grad_norm": 2.7763192653656006,
      "learning_rate": 3.274762401680124e-05,
      "loss": 1.9932,
      "step": 240
    },
    {
      "epoch": 0.618980455369736,
      "eval_loss": 1.9814573526382446,
      "eval_runtime": 114.536,
      "eval_samples_per_second": 2.148,
      "eval_steps_per_second": 2.148,
      "step": 240
    },
    {
      "epoch": 0.629296796292565,
      "grad_norm": 2.6632089614868164,
      "learning_rate": 3.120137444404442e-05,
      "loss": 1.9622,
      "step": 244
    },
    {
      "epoch": 0.6396131372153939,
      "grad_norm": 2.701653480529785,
      "learning_rate": 2.9675789500535328e-05,
      "loss": 1.9519,
      "step": 248
    },
    {
      "epoch": 0.6396131372153939,
      "eval_loss": 1.9895679950714111,
      "eval_runtime": 79.8573,
      "eval_samples_per_second": 3.08,
      "eval_steps_per_second": 3.08,
      "step": 248
    },
    {
      "epoch": 0.6499294781382229,
      "grad_norm": 2.8519248962402344,
      "learning_rate": 2.8172546205008683e-05,
      "loss": 2.0187,
      "step": 252
    },
    {
      "epoch": 0.6602458190610517,
      "grad_norm": 2.4840056896209717,
      "learning_rate": 2.6693297016857188e-05,
      "loss": 1.9843,
      "step": 256
    },
    {
      "epoch": 0.6602458190610517,
      "eval_loss": 1.9900814294815063,
      "eval_runtime": 89.9348,
      "eval_samples_per_second": 2.735,
      "eval_steps_per_second": 2.735,
      "step": 256
    },
    {
      "epoch": 0.6705621599838807,
      "grad_norm": 2.6297292709350586,
      "learning_rate": 2.523966801964468e-05,
      "loss": 1.985,
      "step": 260
    },
    {
      "epoch": 0.6808785009067096,
      "grad_norm": 2.6009626388549805,
      "learning_rate": 2.3813257133612827e-05,
      "loss": 1.963,
      "step": 264
    },
    {
      "epoch": 0.6808785009067096,
      "eval_loss": 1.9819916486740112,
      "eval_runtime": 117.6742,
      "eval_samples_per_second": 2.091,
      "eval_steps_per_second": 2.091,
      "step": 264
    },
    {
      "epoch": 0.6911948418295386,
      "grad_norm": 2.4555246829986572,
      "learning_rate": 2.2415632359146856e-05,
      "loss": 1.9888,
      "step": 268
    },
    {
      "epoch": 0.7015111827523676,
      "grad_norm": 2.845592975616455,
      "learning_rate": 2.104833005313131e-05,
      "loss": 1.9376,
      "step": 272
    },
    {
      "epoch": 0.7015111827523676,
      "eval_loss": 1.979328989982605,
      "eval_runtime": 83.6262,
      "eval_samples_per_second": 2.942,
      "eval_steps_per_second": 2.942,
      "step": 272
    },
    {
      "epoch": 0.7118275236751964,
      "grad_norm": 2.663501262664795,
      "learning_rate": 1.971285324008994e-05,
      "loss": 2.0383,
      "step": 276
    },
    {
      "epoch": 0.7221438645980254,
      "grad_norm": 2.5632455348968506,
      "learning_rate": 1.84106699599668e-05,
      "loss": 1.9876,
      "step": 280
    },
    {
      "epoch": 0.7221438645980254,
      "eval_loss": 1.9884700775146484,
      "eval_runtime": 126.7545,
      "eval_samples_per_second": 1.941,
      "eval_steps_per_second": 1.941,
      "step": 280
    },
    {
      "epoch": 0.7324602055208543,
      "grad_norm": 2.6025450229644775,
      "learning_rate": 1.7143211654364762e-05,
      "loss": 2.0093,
      "step": 284
    },
    {
      "epoch": 0.7427765464436833,
      "grad_norm": 3.391676187515259,
      "learning_rate": 1.5911871593014837e-05,
      "loss": 2.0157,
      "step": 288
    },
    {
      "epoch": 0.7427765464436833,
      "eval_loss": 1.9833872318267822,
      "eval_runtime": 84.5923,
      "eval_samples_per_second": 2.908,
      "eval_steps_per_second": 2.908,
      "step": 288
    },
    {
      "epoch": 0.7530928873665121,
      "grad_norm": 2.5039122104644775,
      "learning_rate": 1.4718003342206722e-05,
      "loss": 1.9728,
      "step": 292
    },
    {
      "epoch": 0.7634092282893411,
      "grad_norm": 3.065073013305664,
      "learning_rate": 1.3562919276863844e-05,
      "loss": 2.011,
      "step": 296
    },
    {
      "epoch": 0.7634092282893411,
      "eval_loss": 1.9842520952224731,
      "eval_runtime": 81.8194,
      "eval_samples_per_second": 3.007,
      "eval_steps_per_second": 3.007,
      "step": 296
    },
    {
      "epoch": 0.7737255692121701,
      "grad_norm": 2.7710351943969727,
      "learning_rate": 1.2447889137898293e-05,
      "loss": 2.0308,
      "step": 300
    },
    {
      "epoch": 0.784041910134999,
      "grad_norm": 3.067017078399658,
      "learning_rate": 1.1374138636432053e-05,
      "loss": 2.0179,
      "step": 304
    },
    {
      "epoch": 0.784041910134999,
      "eval_loss": 1.9779284000396729,
      "eval_runtime": 128.2963,
      "eval_samples_per_second": 1.917,
      "eval_steps_per_second": 1.917,
      "step": 304
    },
    {
      "epoch": 0.794358251057828,
      "grad_norm": 2.3697543144226074,
      "learning_rate": 1.0342848106418368e-05,
      "loss": 1.9998,
      "step": 308
    },
    {
      "epoch": 0.8046745919806568,
      "grad_norm": 2.8660995960235596,
      "learning_rate": 9.35515120714447e-06,
      "loss": 1.9693,
      "step": 312
    },
    {
      "epoch": 0.8046745919806568,
      "eval_loss": 1.978676676750183,
      "eval_runtime": 85.106,
      "eval_samples_per_second": 2.891,
      "eval_steps_per_second": 2.891,
      "step": 312
    },
    {
      "epoch": 0.8149909329034858,
      "grad_norm": 2.3432037830352783,
      "learning_rate": 8.41213367704224e-06,
      "loss": 2.0149,
      "step": 316
    },
    {
      "epoch": 0.8253072738263147,
      "grad_norm": 2.753044605255127,
      "learning_rate": 7.51483214017637e-06,
      "loss": 1.9632,
      "step": 320
    },
    {
      "epoch": 0.8253072738263147,
      "eval_loss": 1.982351541519165,
      "eval_runtime": 114.3934,
      "eval_samples_per_second": 2.15,
      "eval_steps_per_second": 2.15,
      "step": 320
    },
    {
      "epoch": 0.8356236147491437,
      "grad_norm": 2.3161306381225586,
      "learning_rate": 6.664232966721995e-06,
      "loss": 1.9832,
      "step": 324
    },
    {
      "epoch": 0.8459399556719726,
      "grad_norm": 2.218827486038208,
      "learning_rate": 5.8612711886848196e-06,
      "loss": 1.9367,
      "step": 328
    },
    {
      "epoch": 0.8459399556719726,
      "eval_loss": 1.9775891304016113,
      "eval_runtime": 90.4335,
      "eval_samples_per_second": 2.72,
      "eval_steps_per_second": 2.72,
      "step": 328
    },
    {
      "epoch": 0.8562562965948015,
      "grad_norm": 2.4218974113464355,
      "learning_rate": 5.106829472055202e-06,
      "loss": 1.9481,
      "step": 332
    },
    {
      "epoch": 0.8665726375176305,
      "grad_norm": 2.6613683700561523,
      "learning_rate": 4.401737146526219e-06,
      "loss": 1.9824,
      "step": 336
    },
    {
      "epoch": 0.8665726375176305,
      "eval_loss": 1.9729845523834229,
      "eval_runtime": 119.6431,
      "eval_samples_per_second": 2.056,
      "eval_steps_per_second": 2.056,
      "step": 336
    },
    {
      "epoch": 0.8768889784404594,
      "grad_norm": 2.633181095123291,
      "learning_rate": 3.7467692938425057e-06,
      "loss": 1.9396,
      "step": 340
    },
    {
      "epoch": 0.8872053193632883,
      "grad_norm": 2.728327989578247,
      "learning_rate": 3.142645895781715e-06,
      "loss": 1.9911,
      "step": 344
    },
    {
      "epoch": 0.8872053193632883,
      "eval_loss": 1.9718983173370361,
      "eval_runtime": 82.9362,
      "eval_samples_per_second": 2.966,
      "eval_steps_per_second": 2.966,
      "step": 344
    },
    {
      "epoch": 0.8975216602861172,
      "grad_norm": 2.4041829109191895,
      "learning_rate": 2.5900310427053044e-06,
      "loss": 1.9742,
      "step": 348
    },
    {
      "epoch": 0.9078380012089462,
      "grad_norm": 2.4489846229553223,
      "learning_rate": 2.089532203548794e-06,
      "loss": 2.0075,
      "step": 352
    },
    {
      "epoch": 0.9078380012089462,
      "eval_loss": 1.9730160236358643,
      "eval_runtime": 91.0108,
      "eval_samples_per_second": 2.703,
      "eval_steps_per_second": 2.703,
      "step": 352
    },
    {
      "epoch": 0.9181543421317752,
      "grad_norm": 2.5976321697235107,
      "learning_rate": 1.6416995580537664e-06,
      "loss": 1.9874,
      "step": 356
    },
    {
      "epoch": 0.9284706830546041,
      "grad_norm": 3.244720458984375,
      "learning_rate": 1.247025391975698e-06,
      "loss": 1.9809,
      "step": 360
    },
    {
      "epoch": 0.9284706830546041,
      "eval_loss": 1.9729856252670288,
      "eval_runtime": 118.5331,
      "eval_samples_per_second": 2.075,
      "eval_steps_per_second": 2.075,
      "step": 360
    },
    {
      "epoch": 0.938787023977433,
      "grad_norm": 2.2416179180145264,
      "learning_rate": 9.059435559326257e-07,
      "loss": 2.0036,
      "step": 364
    },
    {
      "epoch": 0.9491033649002619,
      "grad_norm": 2.4049975872039795,
      "learning_rate": 6.188289884893062e-07,
      "loss": 1.9971,
      "step": 368
    },
    {
      "epoch": 0.9491033649002619,
      "eval_loss": 1.9721975326538086,
      "eval_runtime": 83.8998,
      "eval_samples_per_second": 2.932,
      "eval_steps_per_second": 2.932,
      "step": 368
    },
    {
      "epoch": 0.9594197058230909,
      "grad_norm": 2.46026611328125,
      "learning_rate": 3.8599730400115107e-07,
      "loss": 1.9341,
      "step": 372
    },
    {
      "epoch": 0.9697360467459198,
      "grad_norm": 2.334559917449951,
      "learning_rate": 2.0770444567118075e-07,
      "loss": 1.9913,
      "step": 376
    },
    {
      "epoch": 0.9697360467459198,
      "eval_loss": 1.9720317125320435,
      "eval_runtime": 127.7811,
      "eval_samples_per_second": 1.925,
      "eval_steps_per_second": 1.925,
      "step": 376
    },
    {
      "epoch": 0.9800523876687487,
      "grad_norm": 2.3480026721954346,
      "learning_rate": 8.414640420116305e-08,
      "loss": 1.9948,
      "step": 380
    },
    {
      "epoch": 0.9903687285915777,
      "grad_norm": 2.4215712547302246,
      "learning_rate": 1.5459002346324135e-08,
      "loss": 1.916,
      "step": 384
    },
    {
      "epoch": 0.9903687285915777,
      "eval_loss": 1.972062587738037,
      "eval_runtime": 84.2065,
      "eval_samples_per_second": 2.921,
      "eval_steps_per_second": 2.921,
      "step": 384
    }
  ],
  "logging_steps": 4,
  "max_steps": 387,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 8,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.5374998322774016e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}