{ "best_metric": 2.145188093185425, "best_model_checkpoint": "/content/working/models/checkpoint-3750", "epoch": 4.8475055544334475, "eval_steps": 750, "global_step": 3750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06463340739244597, "grad_norm": 0.06091862916946411, "learning_rate": 5e-05, "loss": 2.5923, "step": 50 }, { "epoch": 0.12926681478489194, "grad_norm": 0.08663396537303925, "learning_rate": 0.0001, "loss": 2.51, "step": 100 }, { "epoch": 0.1939002221773379, "grad_norm": 0.12191956490278244, "learning_rate": 9.867197875166003e-05, "loss": 2.4241, "step": 150 }, { "epoch": 0.2585336295697839, "grad_norm": 0.1454772800207138, "learning_rate": 9.734395750332006e-05, "loss": 2.3631, "step": 200 }, { "epoch": 0.32316703696222987, "grad_norm": 0.15225572884082794, "learning_rate": 9.601593625498009e-05, "loss": 2.3231, "step": 250 }, { "epoch": 0.3878004443546758, "grad_norm": 0.20414011180400848, "learning_rate": 9.468791500664011e-05, "loss": 2.3052, "step": 300 }, { "epoch": 0.4524338517471218, "grad_norm": 0.14269402623176575, "learning_rate": 9.335989375830013e-05, "loss": 2.3173, "step": 350 }, { "epoch": 0.5170672591395677, "grad_norm": 0.1620974987745285, "learning_rate": 9.203187250996016e-05, "loss": 2.3082, "step": 400 }, { "epoch": 0.5817006665320137, "grad_norm": 0.17254067957401276, "learning_rate": 9.070385126162018e-05, "loss": 2.2901, "step": 450 }, { "epoch": 0.6463340739244597, "grad_norm": 0.17900939285755157, "learning_rate": 8.937583001328021e-05, "loss": 2.2697, "step": 500 }, { "epoch": 0.7109674813169057, "grad_norm": 0.18225091695785522, "learning_rate": 8.804780876494024e-05, "loss": 2.2775, "step": 550 }, { "epoch": 0.7756008887093516, "grad_norm": 0.19103792309761047, "learning_rate": 8.671978751660027e-05, "loss": 2.285, "step": 600 }, { "epoch": 0.8402342961017976, "grad_norm": 0.18991926312446594, "learning_rate": 8.539176626826029e-05, "loss": 2.2732, "step": 650 }, { "epoch": 0.9048677034942436, "grad_norm": 0.19260592758655548, "learning_rate": 8.406374501992032e-05, "loss": 2.2706, "step": 700 }, { "epoch": 0.9695011108866896, "grad_norm": 0.2015322744846344, "learning_rate": 8.273572377158035e-05, "loss": 2.2757, "step": 750 }, { "epoch": 0.9695011108866896, "eval_loss": 2.1990480422973633, "eval_runtime": 668.7494, "eval_samples_per_second": 12.329, "eval_steps_per_second": 12.329, "step": 750 }, { "epoch": 1.0341345182791355, "grad_norm": 0.2102416753768921, "learning_rate": 8.140770252324038e-05, "loss": 2.2683, "step": 800 }, { "epoch": 1.0987679256715814, "grad_norm": 0.2448854297399521, "learning_rate": 8.00796812749004e-05, "loss": 2.2509, "step": 850 }, { "epoch": 1.1634013330640274, "grad_norm": 0.19221533834934235, "learning_rate": 7.875166002656043e-05, "loss": 2.2675, "step": 900 }, { "epoch": 1.2280347404564735, "grad_norm": 0.20851700007915497, "learning_rate": 7.742363877822046e-05, "loss": 2.2595, "step": 950 }, { "epoch": 1.2926681478489195, "grad_norm": 0.2272697389125824, "learning_rate": 7.609561752988048e-05, "loss": 2.2552, "step": 1000 }, { "epoch": 1.3573015552413654, "grad_norm": 0.24437403678894043, "learning_rate": 7.476759628154051e-05, "loss": 2.2545, "step": 1050 }, { "epoch": 1.4219349626338114, "grad_norm": 0.18688540160655975, "learning_rate": 7.343957503320054e-05, "loss": 2.2611, "step": 1100 }, { "epoch": 1.4865683700262573, "grad_norm": 0.25209754705429077, "learning_rate": 7.211155378486057e-05, "loss": 2.2444, "step": 1150 }, { "epoch": 1.5512017774187032, "grad_norm": 0.26226118206977844, "learning_rate": 7.07835325365206e-05, "loss": 2.2491, "step": 1200 }, { "epoch": 1.6158351848111492, "grad_norm": 0.19100286066532135, "learning_rate": 6.945551128818062e-05, "loss": 2.2401, "step": 1250 }, { "epoch": 1.6804685922035953, "grad_norm": 0.24590526521205902, "learning_rate": 6.812749003984064e-05, "loss": 2.238, "step": 1300 }, { "epoch": 1.745101999596041, "grad_norm": 0.20657116174697876, "learning_rate": 6.679946879150066e-05, "loss": 2.2481, "step": 1350 }, { "epoch": 1.8097354069884872, "grad_norm": 0.2326170951128006, "learning_rate": 6.547144754316069e-05, "loss": 2.2503, "step": 1400 }, { "epoch": 1.8743688143809332, "grad_norm": 0.192840114235878, "learning_rate": 6.414342629482072e-05, "loss": 2.2438, "step": 1450 }, { "epoch": 1.939002221773379, "grad_norm": 0.2151508778333664, "learning_rate": 6.281540504648075e-05, "loss": 2.2364, "step": 1500 }, { "epoch": 1.939002221773379, "eval_loss": 2.171081304550171, "eval_runtime": 669.7809, "eval_samples_per_second": 12.31, "eval_steps_per_second": 12.31, "step": 1500 }, { "epoch": 2.0036356291658253, "grad_norm": 0.23874713480472565, "learning_rate": 6.148738379814077e-05, "loss": 2.2375, "step": 1550 }, { "epoch": 2.068269036558271, "grad_norm": 0.23924137651920319, "learning_rate": 6.01593625498008e-05, "loss": 2.234, "step": 1600 }, { "epoch": 2.132902443950717, "grad_norm": 0.24824275076389313, "learning_rate": 5.883134130146083e-05, "loss": 2.2261, "step": 1650 }, { "epoch": 2.197535851343163, "grad_norm": 0.26141875982284546, "learning_rate": 5.7503320053120855e-05, "loss": 2.2319, "step": 1700 }, { "epoch": 2.262169258735609, "grad_norm": 0.23786722123622894, "learning_rate": 5.6175298804780876e-05, "loss": 2.2351, "step": 1750 }, { "epoch": 2.3268026661280548, "grad_norm": 0.22672909498214722, "learning_rate": 5.48472775564409e-05, "loss": 2.2194, "step": 1800 }, { "epoch": 2.391436073520501, "grad_norm": 0.2062198519706726, "learning_rate": 5.351925630810093e-05, "loss": 2.2284, "step": 1850 }, { "epoch": 2.456069480912947, "grad_norm": 0.2124897688627243, "learning_rate": 5.219123505976096e-05, "loss": 2.2472, "step": 1900 }, { "epoch": 2.520702888305393, "grad_norm": 0.24128706753253937, "learning_rate": 5.0863213811420985e-05, "loss": 2.2264, "step": 1950 }, { "epoch": 2.585336295697839, "grad_norm": 0.22971104085445404, "learning_rate": 4.953519256308101e-05, "loss": 2.2134, "step": 2000 }, { "epoch": 2.6499697030902847, "grad_norm": 0.22065037488937378, "learning_rate": 4.820717131474104e-05, "loss": 2.2328, "step": 2050 }, { "epoch": 2.714603110482731, "grad_norm": 0.2271411418914795, "learning_rate": 4.687915006640107e-05, "loss": 2.2252, "step": 2100 }, { "epoch": 2.7792365178751766, "grad_norm": 0.2558536231517792, "learning_rate": 4.555112881806109e-05, "loss": 2.2324, "step": 2150 }, { "epoch": 2.8438699252676227, "grad_norm": 0.23957061767578125, "learning_rate": 4.4223107569721116e-05, "loss": 2.2209, "step": 2200 }, { "epoch": 2.908503332660069, "grad_norm": 0.26163169741630554, "learning_rate": 4.289508632138114e-05, "loss": 2.2172, "step": 2250 }, { "epoch": 2.908503332660069, "eval_loss": 2.156771183013916, "eval_runtime": 668.1763, "eval_samples_per_second": 12.34, "eval_steps_per_second": 12.34, "step": 2250 }, { "epoch": 2.9731367400525146, "grad_norm": 0.26465606689453125, "learning_rate": 4.156706507304117e-05, "loss": 2.2313, "step": 2300 }, { "epoch": 3.0377701474449608, "grad_norm": 0.22578325867652893, "learning_rate": 4.02390438247012e-05, "loss": 2.2253, "step": 2350 }, { "epoch": 3.1024035548374065, "grad_norm": 0.23404210805892944, "learning_rate": 3.8911022576361225e-05, "loss": 2.2121, "step": 2400 }, { "epoch": 3.1670369622298526, "grad_norm": 0.2838682532310486, "learning_rate": 3.758300132802125e-05, "loss": 2.2218, "step": 2450 }, { "epoch": 3.2316703696222984, "grad_norm": 0.310476690530777, "learning_rate": 3.625498007968128e-05, "loss": 2.2159, "step": 2500 }, { "epoch": 3.2963037770147445, "grad_norm": 0.22608056664466858, "learning_rate": 3.492695883134131e-05, "loss": 2.2157, "step": 2550 }, { "epoch": 3.3609371844071907, "grad_norm": 0.23135437071323395, "learning_rate": 3.359893758300133e-05, "loss": 2.2255, "step": 2600 }, { "epoch": 3.4255705917996364, "grad_norm": 0.23839163780212402, "learning_rate": 3.2270916334661356e-05, "loss": 2.2223, "step": 2650 }, { "epoch": 3.4902039991920826, "grad_norm": 0.24394294619560242, "learning_rate": 3.094289508632138e-05, "loss": 2.2199, "step": 2700 }, { "epoch": 3.5548374065845283, "grad_norm": 0.2429606020450592, "learning_rate": 2.961487383798141e-05, "loss": 2.2015, "step": 2750 }, { "epoch": 3.6194708139769745, "grad_norm": 0.25373005867004395, "learning_rate": 2.8286852589641438e-05, "loss": 2.2152, "step": 2800 }, { "epoch": 3.68410422136942, "grad_norm": 0.2572405934333801, "learning_rate": 2.6958831341301462e-05, "loss": 2.2133, "step": 2850 }, { "epoch": 3.7487376287618663, "grad_norm": 0.2735884189605713, "learning_rate": 2.563081009296149e-05, "loss": 2.2171, "step": 2900 }, { "epoch": 3.8133710361543125, "grad_norm": 0.24303478002548218, "learning_rate": 2.4302788844621517e-05, "loss": 2.206, "step": 2950 }, { "epoch": 3.878004443546758, "grad_norm": 0.27165114879608154, "learning_rate": 2.297476759628154e-05, "loss": 2.2202, "step": 3000 }, { "epoch": 3.878004443546758, "eval_loss": 2.148362398147583, "eval_runtime": 668.4731, "eval_samples_per_second": 12.334, "eval_steps_per_second": 12.334, "step": 3000 }, { "epoch": 3.942637850939204, "grad_norm": 0.2713201940059662, "learning_rate": 2.1646746347941568e-05, "loss": 2.2085, "step": 3050 }, { "epoch": 4.0072712583316505, "grad_norm": 0.28700971603393555, "learning_rate": 2.0318725099601595e-05, "loss": 2.2154, "step": 3100 }, { "epoch": 4.071904665724096, "grad_norm": 0.2836604714393616, "learning_rate": 1.899070385126162e-05, "loss": 2.2011, "step": 3150 }, { "epoch": 4.136538073116542, "grad_norm": 0.26256245374679565, "learning_rate": 1.7662682602921647e-05, "loss": 2.2102, "step": 3200 }, { "epoch": 4.201171480508988, "grad_norm": 0.2922552227973938, "learning_rate": 1.6334661354581674e-05, "loss": 2.1935, "step": 3250 }, { "epoch": 4.265804887901434, "grad_norm": 0.26275578141212463, "learning_rate": 1.5006640106241702e-05, "loss": 2.209, "step": 3300 }, { "epoch": 4.33043829529388, "grad_norm": 0.28638267517089844, "learning_rate": 1.3678618857901726e-05, "loss": 2.2105, "step": 3350 }, { "epoch": 4.395071702686326, "grad_norm": 0.27328747510910034, "learning_rate": 1.2350597609561753e-05, "loss": 2.2171, "step": 3400 }, { "epoch": 4.459705110078772, "grad_norm": 0.2329448014497757, "learning_rate": 1.102257636122178e-05, "loss": 2.2133, "step": 3450 }, { "epoch": 4.524338517471218, "grad_norm": 0.24032790958881378, "learning_rate": 9.694555112881806e-06, "loss": 2.2225, "step": 3500 }, { "epoch": 4.588971924863664, "grad_norm": 0.2536959648132324, "learning_rate": 8.366533864541832e-06, "loss": 2.2047, "step": 3550 }, { "epoch": 4.6536053322561095, "grad_norm": 0.2353695183992386, "learning_rate": 7.03851261620186e-06, "loss": 2.2158, "step": 3600 }, { "epoch": 4.718238739648556, "grad_norm": 0.23979786038398743, "learning_rate": 5.710491367861886e-06, "loss": 2.219, "step": 3650 }, { "epoch": 4.782872147041002, "grad_norm": 0.2803194522857666, "learning_rate": 4.382470119521913e-06, "loss": 2.193, "step": 3700 }, { "epoch": 4.8475055544334475, "grad_norm": 0.24206270277500153, "learning_rate": 3.054448871181939e-06, "loss": 2.2079, "step": 3750 }, { "epoch": 4.8475055544334475, "eval_loss": 2.145188093185425, "eval_runtime": 669.1927, "eval_samples_per_second": 12.321, "eval_steps_per_second": 12.321, "step": 3750 } ], "logging_steps": 50, "max_steps": 3865, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 750, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.3013315141632e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }