{
"best_metric": 2.037567615509033,
"best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_pct_default_r32/checkpoint-16",
"epoch": 0.9981059842836993,
"eval_steps": 8,
"global_step": 387,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025790852307072333,
"grad_norm": 23.78647804260254,
"learning_rate": 1.25e-05,
"loss": 2.2237,
"step": 1
},
{
"epoch": 0.010316340922828933,
"grad_norm": 10.68380355834961,
"learning_rate": 5e-05,
"loss": 2.0556,
"step": 4
},
{
"epoch": 0.020632681845657867,
"grad_norm": 13.343311309814453,
"learning_rate": 0.0001,
"loss": 1.9915,
"step": 8
},
{
"epoch": 0.020632681845657867,
"eval_loss": 2.0385255813598633,
"eval_runtime": 251.2319,
"eval_samples_per_second": 0.979,
"eval_steps_per_second": 0.979,
"step": 8
},
{
"epoch": 0.0309490227684868,
"grad_norm": 19.941059112548828,
"learning_rate": 9.997251843068762e-05,
"loss": 2.0389,
"step": 12
},
{
"epoch": 0.04126536369131573,
"grad_norm": 13.265397071838379,
"learning_rate": 9.989010393221656e-05,
"loss": 2.054,
"step": 16
},
{
"epoch": 0.04126536369131573,
"eval_loss": 2.037567615509033,
"eval_runtime": 356.5096,
"eval_samples_per_second": 0.69,
"eval_steps_per_second": 0.69,
"step": 16
},
{
"epoch": 0.05158170461414467,
"grad_norm": 12.110239028930664,
"learning_rate": 9.97528470997769e-05,
"loss": 1.9974,
"step": 20
},
{
"epoch": 0.0618980455369736,
"grad_norm": 15.224806785583496,
"learning_rate": 9.956089881469482e-05,
"loss": 2.0356,
"step": 24
},
{
"epoch": 0.0618980455369736,
"eval_loss": 2.060382127761841,
"eval_runtime": 260.5723,
"eval_samples_per_second": 0.944,
"eval_steps_per_second": 0.944,
"step": 24
},
{
"epoch": 0.07221438645980253,
"grad_norm": 15.310386657714844,
"learning_rate": 9.931447007857432e-05,
"loss": 2.0835,
"step": 28
},
{
"epoch": 0.08253072738263147,
"grad_norm": 14.485530853271484,
"learning_rate": 9.901383178135113e-05,
"loss": 2.0385,
"step": 32
},
{
"epoch": 0.08253072738263147,
"eval_loss": 2.0639231204986572,
"eval_runtime": 347.075,
"eval_samples_per_second": 0.709,
"eval_steps_per_second": 0.709,
"step": 32
},
{
"epoch": 0.09284706830546041,
"grad_norm": 23.205764770507812,
"learning_rate": 9.865931440351337e-05,
"loss": 2.0658,
"step": 36
},
{
"epoch": 0.10316340922828934,
"grad_norm": 20.585407257080078,
"learning_rate": 9.825130765281668e-05,
"loss": 2.1223,
"step": 40
},
{
"epoch": 0.10316340922828934,
"eval_loss": 2.083315849304199,
"eval_runtime": 244.456,
"eval_samples_per_second": 1.006,
"eval_steps_per_second": 1.006,
"step": 40
},
{
"epoch": 0.11347975015111827,
"grad_norm": 23.029006958007812,
"learning_rate": 9.779026003589304e-05,
"loss": 2.0669,
"step": 44
},
{
"epoch": 0.1237960910739472,
"grad_norm": 27.581724166870117,
"learning_rate": 9.727667836522407e-05,
"loss": 2.0677,
"step": 48
},
{
"epoch": 0.1237960910739472,
"eval_loss": 2.0909621715545654,
"eval_runtime": 258.3491,
"eval_samples_per_second": 0.952,
"eval_steps_per_second": 0.952,
"step": 48
},
{
"epoch": 0.13411243199677614,
"grad_norm": 20.641569137573242,
"learning_rate": 9.6711127202021e-05,
"loss": 2.118,
"step": 52
},
{
"epoch": 0.14442877291960507,
"grad_norm": 15.354959487915039,
"learning_rate": 9.609422823562345e-05,
"loss": 2.0729,
"step": 56
},
{
"epoch": 0.14442877291960507,
"eval_loss": 2.0872063636779785,
"eval_runtime": 366.7103,
"eval_samples_per_second": 0.671,
"eval_steps_per_second": 0.671,
"step": 56
},
{
"epoch": 0.154745113842434,
"grad_norm": 13.397254943847656,
"learning_rate": 9.542665960009959e-05,
"loss": 2.0806,
"step": 60
},
{
"epoch": 0.16506145476526293,
"grad_norm": 14.432162284851074,
"learning_rate": 9.470915512879852e-05,
"loss": 2.1197,
"step": 64
},
{
"epoch": 0.16506145476526293,
"eval_loss": 2.0972707271575928,
"eval_runtime": 241.6226,
"eval_samples_per_second": 1.018,
"eval_steps_per_second": 1.018,
"step": 64
},
{
"epoch": 0.1753777956880919,
"grad_norm": 16.219799041748047,
"learning_rate": 9.394250354767467e-05,
"loss": 2.1002,
"step": 68
},
{
"epoch": 0.18569413661092082,
"grad_norm": 19.845632553100586,
"learning_rate": 9.312754760827061e-05,
"loss": 2.1053,
"step": 72
},
{
"epoch": 0.18569413661092082,
"eval_loss": 2.091938018798828,
"eval_runtime": 374.7664,
"eval_samples_per_second": 0.656,
"eval_steps_per_second": 0.656,
"step": 72
},
{
"epoch": 0.19601047753374976,
"grad_norm": 19.960819244384766,
"learning_rate": 9.226518316131176e-05,
"loss": 2.1111,
"step": 76
},
{
"epoch": 0.2063268184565787,
"grad_norm": 22.90218734741211,
"learning_rate": 9.1356358171931e-05,
"loss": 2.0848,
"step": 80
},
{
"epoch": 0.2063268184565787,
"eval_loss": 2.1034836769104004,
"eval_runtime": 250.3178,
"eval_samples_per_second": 0.983,
"eval_steps_per_second": 0.983,
"step": 80
},
{
"epoch": 0.21664315937940762,
"grad_norm": 20.233325958251953,
"learning_rate": 9.040207167760586e-05,
"loss": 2.0845,
"step": 84
},
{
"epoch": 0.22695950030223655,
"grad_norm": 18.64703941345215,
"learning_rate": 8.940337268995385e-05,
"loss": 2.1015,
"step": 88
},
{
"epoch": 0.22695950030223655,
"eval_loss": 2.1114280223846436,
"eval_runtime": 378.994,
"eval_samples_per_second": 0.649,
"eval_steps_per_second": 0.649,
"step": 88
},
{
"epoch": 0.23727584122506548,
"grad_norm": 12.925492286682129,
"learning_rate": 8.836135904159302e-05,
"loss": 2.1163,
"step": 92
},
{
"epoch": 0.2475921821478944,
"grad_norm": 17.775686264038086,
"learning_rate": 8.727717617933544e-05,
"loss": 2.0872,
"step": 96
},
{
"epoch": 0.2475921821478944,
"eval_loss": 2.113255023956299,
"eval_runtime": 253.955,
"eval_samples_per_second": 0.969,
"eval_steps_per_second": 0.969,
"step": 96
},
{
"epoch": 0.25790852307072337,
"grad_norm": 13.843049049377441,
"learning_rate": 8.615201590504017e-05,
"loss": 2.1374,
"step": 100
},
{
"epoch": 0.2682248639935523,
"grad_norm": 17.85464859008789,
"learning_rate": 8.498711506550983e-05,
"loss": 2.0948,
"step": 104
},
{
"epoch": 0.2682248639935523,
"eval_loss": 2.1220598220825195,
"eval_runtime": 250.0959,
"eval_samples_per_second": 0.984,
"eval_steps_per_second": 0.984,
"step": 104
},
{
"epoch": 0.27854120491638124,
"grad_norm": 14.508893966674805,
"learning_rate": 8.378375419287099e-05,
"loss": 2.1539,
"step": 108
},
{
"epoch": 0.28885754583921014,
"grad_norm": 14.467598915100098,
"learning_rate": 8.25432560969328e-05,
"loss": 2.097,
"step": 112
},
{
"epoch": 0.28885754583921014,
"eval_loss": 2.1218836307525635,
"eval_runtime": 366.7235,
"eval_samples_per_second": 0.671,
"eval_steps_per_second": 0.671,
"step": 112
},
{
"epoch": 0.2991738867620391,
"grad_norm": 25.095067977905273,
"learning_rate": 8.126698441107146e-05,
"loss": 2.1127,
"step": 116
},
{
"epoch": 0.309490227684868,
"grad_norm": 16.594051361083984,
"learning_rate": 7.995634209323886e-05,
"loss": 2.147,
"step": 120
},
{
"epoch": 0.309490227684868,
"eval_loss": 2.1240272521972656,
"eval_runtime": 255.4843,
"eval_samples_per_second": 0.963,
"eval_steps_per_second": 0.963,
"step": 120
},
{
"epoch": 0.31980656860769696,
"grad_norm": 14.561055183410645,
"learning_rate": 7.861276988374302e-05,
"loss": 2.1397,
"step": 124
},
{
"epoch": 0.33012290953052587,
"grad_norm": 16.39865493774414,
"learning_rate": 7.723774472149601e-05,
"loss": 2.1315,
"step": 128
},
{
"epoch": 0.33012290953052587,
"eval_loss": 2.1188597679138184,
"eval_runtime": 389.6053,
"eval_samples_per_second": 0.631,
"eval_steps_per_second": 0.631,
"step": 128
},
{
"epoch": 0.3404392504533548,
"grad_norm": 15.670085906982422,
"learning_rate": 7.583277812046993e-05,
"loss": 2.1466,
"step": 132
},
{
"epoch": 0.3507555913761838,
"grad_norm": 22.689796447753906,
"learning_rate": 7.439941450814591e-05,
"loss": 2.1563,
"step": 136
},
{
"epoch": 0.3507555913761838,
"eval_loss": 2.136768341064453,
"eval_runtime": 252.2156,
"eval_samples_per_second": 0.975,
"eval_steps_per_second": 0.975,
"step": 136
},
{
"epoch": 0.3610719322990127,
"grad_norm": 18.863473892211914,
"learning_rate": 7.293922952778239e-05,
"loss": 2.1586,
"step": 140
},
{
"epoch": 0.37138827322184165,
"grad_norm": 17.896759033203125,
"learning_rate": 7.145382830636924e-05,
"loss": 2.1836,
"step": 144
},
{
"epoch": 0.37138827322184165,
"eval_loss": 2.127108335494995,
"eval_runtime": 364.7213,
"eval_samples_per_second": 0.674,
"eval_steps_per_second": 0.674,
"step": 144
},
{
"epoch": 0.38170461414467055,
"grad_norm": 21.21641731262207,
"learning_rate": 6.994484369017143e-05,
"loss": 2.1371,
"step": 148
},
{
"epoch": 0.3920209550674995,
"grad_norm": 15.671196937561035,
"learning_rate": 6.841393444980177e-05,
"loss": 2.1245,
"step": 152
},
{
"epoch": 0.3920209550674995,
"eval_loss": 2.1197926998138428,
"eval_runtime": 395.7933,
"eval_samples_per_second": 0.622,
"eval_steps_per_second": 0.622,
"step": 152
},
{
"epoch": 0.4023372959903284,
"grad_norm": 16.22295379638672,
"learning_rate": 6.686278345679625e-05,
"loss": 2.1008,
"step": 156
},
{
"epoch": 0.4126536369131574,
"grad_norm": 14.019622802734375,
"learning_rate": 6.529309583369605e-05,
"loss": 2.0947,
"step": 160
},
{
"epoch": 0.4126536369131574,
"eval_loss": 2.123974561691284,
"eval_runtime": 257.3823,
"eval_samples_per_second": 0.956,
"eval_steps_per_second": 0.956,
"step": 160
},
{
"epoch": 0.4229699778359863,
"grad_norm": 18.07195281982422,
"learning_rate": 6.370659707966967e-05,
"loss": 2.1187,
"step": 164
},
{
"epoch": 0.43328631875881524,
"grad_norm": 19.1149959564209,
"learning_rate": 6.2105031173736e-05,
"loss": 2.1472,
"step": 168
},
{
"epoch": 0.43328631875881524,
"eval_loss": 2.13541579246521,
"eval_runtime": 358.8473,
"eval_samples_per_second": 0.686,
"eval_steps_per_second": 0.686,
"step": 168
},
{
"epoch": 0.44360265968164414,
"grad_norm": 17.229467391967773,
"learning_rate": 6.049015865767318e-05,
"loss": 2.0992,
"step": 172
},
{
"epoch": 0.4539190006044731,
"grad_norm": 14.356927871704102,
"learning_rate": 5.88637547007204e-05,
"loss": 2.1348,
"step": 176
},
{
"epoch": 0.4539190006044731,
"eval_loss": 2.1260793209075928,
"eval_runtime": 248.3974,
"eval_samples_per_second": 0.99,
"eval_steps_per_second": 0.99,
"step": 176
},
{
"epoch": 0.46423534152730206,
"grad_norm": 16.18288803100586,
"learning_rate": 5.722760714820057e-05,
"loss": 2.1451,
"step": 180
},
{
"epoch": 0.47455168245013096,
"grad_norm": 16.129526138305664,
"learning_rate": 5.5583514556208514e-05,
"loss": 2.1099,
"step": 184
},
{
"epoch": 0.47455168245013096,
"eval_loss": 2.127509355545044,
"eval_runtime": 357.9954,
"eval_samples_per_second": 0.687,
"eval_steps_per_second": 0.687,
"step": 184
},
{
"epoch": 0.4848680233729599,
"grad_norm": 17.3072509765625,
"learning_rate": 5.393328421452514e-05,
"loss": 2.108,
"step": 188
},
{
"epoch": 0.4951843642957888,
"grad_norm": 15.436331748962402,
"learning_rate": 5.2278730159931076e-05,
"loss": 2.1006,
"step": 192
},
{
"epoch": 0.4951843642957888,
"eval_loss": 2.1196277141571045,
"eval_runtime": 266.5248,
"eval_samples_per_second": 0.923,
"eval_steps_per_second": 0.923,
"step": 192
},
{
"epoch": 0.5055007052186178,
"grad_norm": 13.889219284057617,
"learning_rate": 5.062167118210367e-05,
"loss": 2.1588,
"step": 196
},
{
"epoch": 0.5158170461414467,
"grad_norm": 15.053701400756836,
"learning_rate": 4.896392882428901e-05,
"loss": 2.1339,
"step": 200
},
{
"epoch": 0.5158170461414467,
"eval_loss": 2.1169850826263428,
"eval_runtime": 243.5045,
"eval_samples_per_second": 1.01,
"eval_steps_per_second": 1.01,
"step": 200
},
{
"epoch": 0.5261333870642756,
"grad_norm": 15.929015159606934,
"learning_rate": 4.730732538094749e-05,
"loss": 2.1116,
"step": 204
},
{
"epoch": 0.5364497279871046,
"grad_norm": 13.288613319396973,
"learning_rate": 4.565368189457313e-05,
"loss": 2.0841,
"step": 208
},
{
"epoch": 0.5364497279871046,
"eval_loss": 2.1104753017425537,
"eval_runtime": 365.3009,
"eval_samples_per_second": 0.673,
"eval_steps_per_second": 0.673,
"step": 208
},
{
"epoch": 0.5467660689099335,
"grad_norm": 15.282868385314941,
"learning_rate": 4.400481615388948e-05,
"loss": 2.0709,
"step": 212
},
{
"epoch": 0.5570824098327625,
"grad_norm": 16.86932945251465,
"learning_rate": 4.236254069562213e-05,
"loss": 2.1344,
"step": 216
},
{
"epoch": 0.5570824098327625,
"eval_loss": 2.1079087257385254,
"eval_runtime": 251.0859,
"eval_samples_per_second": 0.98,
"eval_steps_per_second": 0.98,
"step": 216
},
{
"epoch": 0.5673987507555914,
"grad_norm": 15.028175354003906,
"learning_rate": 4.0728660812044536e-05,
"loss": 2.1397,
"step": 220
},
{
"epoch": 0.5777150916784203,
"grad_norm": 14.224735260009766,
"learning_rate": 3.910497256648742e-05,
"loss": 2.0732,
"step": 224
},
{
"epoch": 0.5777150916784203,
"eval_loss": 2.104301929473877,
"eval_runtime": 366.8962,
"eval_samples_per_second": 0.67,
"eval_steps_per_second": 0.67,
"step": 224
},
{
"epoch": 0.5880314326012492,
"grad_norm": 15.107568740844727,
"learning_rate": 3.749326081899329e-05,
"loss": 2.1003,
"step": 228
},
{
"epoch": 0.5983477735240782,
"grad_norm": 12.49576473236084,
"learning_rate": 3.589529726428615e-05,
"loss": 2.0417,
"step": 232
},
{
"epoch": 0.5983477735240782,
"eval_loss": 2.103517532348633,
"eval_runtime": 243.1722,
"eval_samples_per_second": 1.012,
"eval_steps_per_second": 1.012,
"step": 232
},
{
"epoch": 0.6086641144469072,
"grad_norm": 10.86776065826416,
"learning_rate": 3.431283848421347e-05,
"loss": 2.1209,
"step": 236
},
{
"epoch": 0.618980455369736,
"grad_norm": 15.035432815551758,
"learning_rate": 3.274762401680124e-05,
"loss": 2.1003,
"step": 240
},
{
"epoch": 0.618980455369736,
"eval_loss": 2.0966618061065674,
"eval_runtime": 354.4771,
"eval_samples_per_second": 0.694,
"eval_steps_per_second": 0.694,
"step": 240
},
{
"epoch": 0.629296796292565,
"grad_norm": 19.571407318115234,
"learning_rate": 3.120137444404442e-05,
"loss": 2.0631,
"step": 244
},
{
"epoch": 0.6396131372153939,
"grad_norm": 12.437437057495117,
"learning_rate": 2.9675789500535328e-05,
"loss": 2.0501,
"step": 248
},
{
"epoch": 0.6396131372153939,
"eval_loss": 2.1006603240966797,
"eval_runtime": 242.766,
"eval_samples_per_second": 1.013,
"eval_steps_per_second": 1.013,
"step": 248
},
{
"epoch": 0.6499294781382229,
"grad_norm": 14.765650749206543,
"learning_rate": 2.8172546205008683e-05,
"loss": 2.1215,
"step": 252
},
{
"epoch": 0.6602458190610517,
"grad_norm": 12.97276496887207,
"learning_rate": 2.6693297016857188e-05,
"loss": 2.078,
"step": 256
},
{
"epoch": 0.6602458190610517,
"eval_loss": 2.0861904621124268,
"eval_runtime": 251.7783,
"eval_samples_per_second": 0.977,
"eval_steps_per_second": 0.977,
"step": 256
},
{
"epoch": 0.6705621599838807,
"grad_norm": 13.271310806274414,
"learning_rate": 2.523966801964468e-05,
"loss": 2.0733,
"step": 260
},
{
"epoch": 0.6808785009067096,
"grad_norm": 13.07889461517334,
"learning_rate": 2.3813257133612827e-05,
"loss": 2.0507,
"step": 264
},
{
"epoch": 0.6808785009067096,
"eval_loss": 2.084022045135498,
"eval_runtime": 351.4005,
"eval_samples_per_second": 0.7,
"eval_steps_per_second": 0.7,
"step": 264
},
{
"epoch": 0.6911948418295386,
"grad_norm": 12.636960983276367,
"learning_rate": 2.2415632359146856e-05,
"loss": 2.0855,
"step": 268
},
{
"epoch": 0.7015111827523676,
"grad_norm": 16.52528190612793,
"learning_rate": 2.104833005313131e-05,
"loss": 2.0235,
"step": 272
},
{
"epoch": 0.7015111827523676,
"eval_loss": 2.076244354248047,
"eval_runtime": 240.3826,
"eval_samples_per_second": 1.023,
"eval_steps_per_second": 1.023,
"step": 272
},
{
"epoch": 0.7118275236751964,
"grad_norm": 11.037178039550781,
"learning_rate": 1.971285324008994e-05,
"loss": 2.1384,
"step": 276
},
{
"epoch": 0.7221438645980254,
"grad_norm": 13.414417266845703,
"learning_rate": 1.84106699599668e-05,
"loss": 2.0743,
"step": 280
},
{
"epoch": 0.7221438645980254,
"eval_loss": 2.072314500808716,
"eval_runtime": 366.894,
"eval_samples_per_second": 0.67,
"eval_steps_per_second": 0.67,
"step": 280
},
{
"epoch": 0.7324602055208543,
"grad_norm": 12.51091480255127,
"learning_rate": 1.7143211654364762e-05,
"loss": 2.0863,
"step": 284
},
{
"epoch": 0.7427765464436833,
"grad_norm": 15.719615936279297,
"learning_rate": 1.5911871593014837e-05,
"loss": 2.1028,
"step": 288
},
{
"epoch": 0.7427765464436833,
"eval_loss": 2.0720996856689453,
"eval_runtime": 240.5658,
"eval_samples_per_second": 1.023,
"eval_steps_per_second": 1.023,
"step": 288
},
{
"epoch": 0.7530928873665121,
"grad_norm": 11.165517807006836,
"learning_rate": 1.4718003342206722e-05,
"loss": 2.0551,
"step": 292
},
{
"epoch": 0.7634092282893411,
"grad_norm": 13.10633659362793,
"learning_rate": 1.3562919276863844e-05,
"loss": 2.0987,
"step": 296
},
{
"epoch": 0.7634092282893411,
"eval_loss": 2.066223382949829,
"eval_runtime": 248.9943,
"eval_samples_per_second": 0.988,
"eval_steps_per_second": 0.988,
"step": 296
},
{
"epoch": 0.7737255692121701,
"grad_norm": 15.164176940917969,
"learning_rate": 1.2447889137898293e-05,
"loss": 2.113,
"step": 300
},
{
"epoch": 0.784041910134999,
"grad_norm": 13.534417152404785,
"learning_rate": 1.1374138636432053e-05,
"loss": 2.0985,
"step": 304
},
{
"epoch": 0.784041910134999,
"eval_loss": 2.066317558288574,
"eval_runtime": 373.4925,
"eval_samples_per_second": 0.659,
"eval_steps_per_second": 0.659,
"step": 304
},
{
"epoch": 0.794358251057828,
"grad_norm": 11.72275161743164,
"learning_rate": 1.0342848106418368e-05,
"loss": 2.0898,
"step": 308
},
{
"epoch": 0.8046745919806568,
"grad_norm": 12.691418647766113,
"learning_rate": 9.35515120714447e-06,
"loss": 2.0548,
"step": 312
},
{
"epoch": 0.8046745919806568,
"eval_loss": 2.0601634979248047,
"eval_runtime": 252.2425,
"eval_samples_per_second": 0.975,
"eval_steps_per_second": 0.975,
"step": 312
},
{
"epoch": 0.8149909329034858,
"grad_norm": 9.275248527526855,
"learning_rate": 8.41213367704224e-06,
"loss": 2.0907,
"step": 316
},
{
"epoch": 0.8253072738263147,
"grad_norm": 13.456718444824219,
"learning_rate": 7.51483214017637e-06,
"loss": 2.0365,
"step": 320
},
{
"epoch": 0.8253072738263147,
"eval_loss": 2.0563478469848633,
"eval_runtime": 378.3713,
"eval_samples_per_second": 0.65,
"eval_steps_per_second": 0.65,
"step": 320
},
{
"epoch": 0.8356236147491437,
"grad_norm": 10.312747955322266,
"learning_rate": 6.664232966721995e-06,
"loss": 2.0537,
"step": 324
},
{
"epoch": 0.8459399556719726,
"grad_norm": 11.122403144836426,
"learning_rate": 5.8612711886848196e-06,
"loss": 2.0102,
"step": 328
},
{
"epoch": 0.8459399556719726,
"eval_loss": 2.056385040283203,
"eval_runtime": 247.0489,
"eval_samples_per_second": 0.996,
"eval_steps_per_second": 0.996,
"step": 328
},
{
"epoch": 0.8562562965948015,
"grad_norm": 12.165043830871582,
"learning_rate": 5.106829472055202e-06,
"loss": 2.0219,
"step": 332
},
{
"epoch": 0.8665726375176305,
"grad_norm": 14.259943008422852,
"learning_rate": 4.401737146526219e-06,
"loss": 2.0497,
"step": 336
},
{
"epoch": 0.8665726375176305,
"eval_loss": 2.0522236824035645,
"eval_runtime": 366.3478,
"eval_samples_per_second": 0.671,
"eval_steps_per_second": 0.671,
"step": 336
},
{
"epoch": 0.8768889784404594,
"grad_norm": 12.8046293258667,
"learning_rate": 3.7467692938425057e-06,
"loss": 2.0136,
"step": 340
},
{
"epoch": 0.8872053193632883,
"grad_norm": 15.552796363830566,
"learning_rate": 3.142645895781715e-06,
"loss": 2.0721,
"step": 344
},
{
"epoch": 0.8872053193632883,
"eval_loss": 2.0470504760742188,
"eval_runtime": 245.8972,
"eval_samples_per_second": 1.0,
"eval_steps_per_second": 1.0,
"step": 344
},
{
"epoch": 0.8975216602861172,
"grad_norm": 10.629472732543945,
"learning_rate": 2.5900310427053044e-06,
"loss": 2.0459,
"step": 348
},
{
"epoch": 0.9078380012089462,
"grad_norm": 12.79671859741211,
"learning_rate": 2.089532203548794e-06,
"loss": 2.0812,
"step": 352
},
{
"epoch": 0.9078380012089462,
"eval_loss": 2.0468461513519287,
"eval_runtime": 246.3251,
"eval_samples_per_second": 0.999,
"eval_steps_per_second": 0.999,
"step": 352
},
{
"epoch": 0.9181543421317752,
"grad_norm": 10.172734260559082,
"learning_rate": 1.6416995580537664e-06,
"loss": 2.0588,
"step": 356
},
{
"epoch": 0.9284706830546041,
"grad_norm": 13.063249588012695,
"learning_rate": 1.247025391975698e-06,
"loss": 2.0475,
"step": 360
},
{
"epoch": 0.9284706830546041,
"eval_loss": 2.0462241172790527,
"eval_runtime": 383.7451,
"eval_samples_per_second": 0.641,
"eval_steps_per_second": 0.641,
"step": 360
},
{
"epoch": 0.938787023977433,
"grad_norm": 9.2144193649292,
"learning_rate": 9.059435559326257e-07,
"loss": 2.0669,
"step": 364
},
{
"epoch": 0.9491033649002619,
"grad_norm": 11.035449028015137,
"learning_rate": 6.188289884893062e-07,
"loss": 2.0687,
"step": 368
},
{
"epoch": 0.9491033649002619,
"eval_loss": 2.045191526412964,
"eval_runtime": 250.014,
"eval_samples_per_second": 0.984,
"eval_steps_per_second": 0.984,
"step": 368
},
{
"epoch": 0.9594197058230909,
"grad_norm": 11.38818073272705,
"learning_rate": 3.8599730400115107e-07,
"loss": 2.0007,
"step": 372
},
{
"epoch": 0.9697360467459198,
"grad_norm": 11.021515846252441,
"learning_rate": 2.0770444567118075e-07,
"loss": 2.065,
"step": 376
},
{
"epoch": 0.9697360467459198,
"eval_loss": 2.044952392578125,
"eval_runtime": 364.9331,
"eval_samples_per_second": 0.674,
"eval_steps_per_second": 0.674,
"step": 376
},
{
"epoch": 0.9800523876687487,
"grad_norm": 14.076632499694824,
"learning_rate": 8.414640420116305e-08,
"loss": 2.0618,
"step": 380
},
{
"epoch": 0.9903687285915777,
"grad_norm": 15.729289054870605,
"learning_rate": 1.5459002346324135e-08,
"loss": 1.991,
"step": 384
},
{
"epoch": 0.9903687285915777,
"eval_loss": 2.04483962059021,
"eval_runtime": 253.6132,
"eval_samples_per_second": 0.97,
"eval_steps_per_second": 0.97,
"step": 384
}
],
"logging_steps": 4,
"max_steps": 387,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 8,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.5374998322774016e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}