{
  "best_metric": 2.145188093185425,
  "best_model_checkpoint": "/content/working/models/checkpoint-3750",
  "epoch": 4.8475055544334475,
  "eval_steps": 750,
  "global_step": 3750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06463340739244597,
      "grad_norm": 0.06091862916946411,
      "learning_rate": 5e-05,
      "loss": 2.5923,
      "step": 50
    },
    {
      "epoch": 0.12926681478489194,
      "grad_norm": 0.08663396537303925,
      "learning_rate": 0.0001,
      "loss": 2.51,
      "step": 100
    },
    {
      "epoch": 0.1939002221773379,
      "grad_norm": 0.12191956490278244,
      "learning_rate": 9.867197875166003e-05,
      "loss": 2.4241,
      "step": 150
    },
    {
      "epoch": 0.2585336295697839,
      "grad_norm": 0.1454772800207138,
      "learning_rate": 9.734395750332006e-05,
      "loss": 2.3631,
      "step": 200
    },
    {
      "epoch": 0.32316703696222987,
      "grad_norm": 0.15225572884082794,
      "learning_rate": 9.601593625498009e-05,
      "loss": 2.3231,
      "step": 250
    },
    {
      "epoch": 0.3878004443546758,
      "grad_norm": 0.20414011180400848,
      "learning_rate": 9.468791500664011e-05,
      "loss": 2.3052,
      "step": 300
    },
    {
      "epoch": 0.4524338517471218,
      "grad_norm": 0.14269402623176575,
      "learning_rate": 9.335989375830013e-05,
      "loss": 2.3173,
      "step": 350
    },
    {
      "epoch": 0.5170672591395677,
      "grad_norm": 0.1620974987745285,
      "learning_rate": 9.203187250996016e-05,
      "loss": 2.3082,
      "step": 400
    },
    {
      "epoch": 0.5817006665320137,
      "grad_norm": 0.17254067957401276,
      "learning_rate": 9.070385126162018e-05,
      "loss": 2.2901,
      "step": 450
    },
    {
      "epoch": 0.6463340739244597,
      "grad_norm": 0.17900939285755157,
      "learning_rate": 8.937583001328021e-05,
      "loss": 2.2697,
      "step": 500
    },
    {
      "epoch": 0.7109674813169057,
      "grad_norm": 0.18225091695785522,
      "learning_rate": 8.804780876494024e-05,
      "loss": 2.2775,
      "step": 550
    },
    {
      "epoch": 0.7756008887093516,
      "grad_norm": 0.19103792309761047,
      "learning_rate": 8.671978751660027e-05,
      "loss": 2.285,
      "step": 600
    },
    {
      "epoch": 0.8402342961017976,
      "grad_norm": 0.18991926312446594,
      "learning_rate": 8.539176626826029e-05,
      "loss": 2.2732,
      "step": 650
    },
    {
      "epoch": 0.9048677034942436,
      "grad_norm": 0.19260592758655548,
      "learning_rate": 8.406374501992032e-05,
      "loss": 2.2706,
      "step": 700
    },
    {
      "epoch": 0.9695011108866896,
      "grad_norm": 0.2015322744846344,
      "learning_rate": 8.273572377158035e-05,
      "loss": 2.2757,
      "step": 750
    },
    {
      "epoch": 0.9695011108866896,
      "eval_loss": 2.1990480422973633,
      "eval_runtime": 668.7494,
      "eval_samples_per_second": 12.329,
      "eval_steps_per_second": 12.329,
      "step": 750
    },
    {
      "epoch": 1.0341345182791355,
      "grad_norm": 0.2102416753768921,
      "learning_rate": 8.140770252324038e-05,
      "loss": 2.2683,
      "step": 800
    },
    {
      "epoch": 1.0987679256715814,
      "grad_norm": 0.2448854297399521,
      "learning_rate": 8.00796812749004e-05,
      "loss": 2.2509,
      "step": 850
    },
    {
      "epoch": 1.1634013330640274,
      "grad_norm": 0.19221533834934235,
      "learning_rate": 7.875166002656043e-05,
      "loss": 2.2675,
      "step": 900
    },
    {
      "epoch": 1.2280347404564735,
      "grad_norm": 0.20851700007915497,
      "learning_rate": 7.742363877822046e-05,
      "loss": 2.2595,
      "step": 950
    },
    {
      "epoch": 1.2926681478489195,
      "grad_norm": 0.2272697389125824,
      "learning_rate": 7.609561752988048e-05,
      "loss": 2.2552,
      "step": 1000
    },
    {
      "epoch": 1.3573015552413654,
      "grad_norm": 0.24437403678894043,
      "learning_rate": 7.476759628154051e-05,
      "loss": 2.2545,
      "step": 1050
    },
    {
      "epoch": 1.4219349626338114,
      "grad_norm": 0.18688540160655975,
      "learning_rate": 7.343957503320054e-05,
      "loss": 2.2611,
      "step": 1100
    },
    {
      "epoch": 1.4865683700262573,
      "grad_norm": 0.25209754705429077,
      "learning_rate": 7.211155378486057e-05,
      "loss": 2.2444,
      "step": 1150
    },
    {
      "epoch": 1.5512017774187032,
      "grad_norm": 0.26226118206977844,
      "learning_rate": 7.07835325365206e-05,
      "loss": 2.2491,
      "step": 1200
    },
    {
      "epoch": 1.6158351848111492,
      "grad_norm": 0.19100286066532135,
      "learning_rate": 6.945551128818062e-05,
      "loss": 2.2401,
      "step": 1250
    },
    {
      "epoch": 1.6804685922035953,
      "grad_norm": 0.24590526521205902,
      "learning_rate": 6.812749003984064e-05,
      "loss": 2.238,
      "step": 1300
    },
    {
      "epoch": 1.745101999596041,
      "grad_norm": 0.20657116174697876,
      "learning_rate": 6.679946879150066e-05,
      "loss": 2.2481,
      "step": 1350
    },
    {
      "epoch": 1.8097354069884872,
      "grad_norm": 0.2326170951128006,
      "learning_rate": 6.547144754316069e-05,
      "loss": 2.2503,
      "step": 1400
    },
    {
      "epoch": 1.8743688143809332,
      "grad_norm": 0.192840114235878,
      "learning_rate": 6.414342629482072e-05,
      "loss": 2.2438,
      "step": 1450
    },
    {
      "epoch": 1.939002221773379,
      "grad_norm": 0.2151508778333664,
      "learning_rate": 6.281540504648075e-05,
      "loss": 2.2364,
      "step": 1500
    },
    {
      "epoch": 1.939002221773379,
      "eval_loss": 2.171081304550171,
      "eval_runtime": 669.7809,
      "eval_samples_per_second": 12.31,
      "eval_steps_per_second": 12.31,
      "step": 1500
    },
    {
      "epoch": 2.0036356291658253,
      "grad_norm": 0.23874713480472565,
      "learning_rate": 6.148738379814077e-05,
      "loss": 2.2375,
      "step": 1550
    },
    {
      "epoch": 2.068269036558271,
      "grad_norm": 0.23924137651920319,
      "learning_rate": 6.01593625498008e-05,
      "loss": 2.234,
      "step": 1600
    },
    {
      "epoch": 2.132902443950717,
      "grad_norm": 0.24824275076389313,
      "learning_rate": 5.883134130146083e-05,
      "loss": 2.2261,
      "step": 1650
    },
    {
      "epoch": 2.197535851343163,
      "grad_norm": 0.26141875982284546,
      "learning_rate": 5.7503320053120855e-05,
      "loss": 2.2319,
      "step": 1700
    },
    {
      "epoch": 2.262169258735609,
      "grad_norm": 0.23786722123622894,
      "learning_rate": 5.6175298804780876e-05,
      "loss": 2.2351,
      "step": 1750
    },
    {
      "epoch": 2.3268026661280548,
      "grad_norm": 0.22672909498214722,
      "learning_rate": 5.48472775564409e-05,
      "loss": 2.2194,
      "step": 1800
    },
    {
      "epoch": 2.391436073520501,
      "grad_norm": 0.2062198519706726,
      "learning_rate": 5.351925630810093e-05,
      "loss": 2.2284,
      "step": 1850
    },
    {
      "epoch": 2.456069480912947,
      "grad_norm": 0.2124897688627243,
      "learning_rate": 5.219123505976096e-05,
      "loss": 2.2472,
      "step": 1900
    },
    {
      "epoch": 2.520702888305393,
      "grad_norm": 0.24128706753253937,
      "learning_rate": 5.0863213811420985e-05,
      "loss": 2.2264,
      "step": 1950
    },
    {
      "epoch": 2.585336295697839,
      "grad_norm": 0.22971104085445404,
      "learning_rate": 4.953519256308101e-05,
      "loss": 2.2134,
      "step": 2000
    },
    {
      "epoch": 2.6499697030902847,
      "grad_norm": 0.22065037488937378,
      "learning_rate": 4.820717131474104e-05,
      "loss": 2.2328,
      "step": 2050
    },
    {
      "epoch": 2.714603110482731,
      "grad_norm": 0.2271411418914795,
      "learning_rate": 4.687915006640107e-05,
      "loss": 2.2252,
      "step": 2100
    },
    {
      "epoch": 2.7792365178751766,
      "grad_norm": 0.2558536231517792,
      "learning_rate": 4.555112881806109e-05,
      "loss": 2.2324,
      "step": 2150
    },
    {
      "epoch": 2.8438699252676227,
      "grad_norm": 0.23957061767578125,
      "learning_rate": 4.4223107569721116e-05,
      "loss": 2.2209,
      "step": 2200
    },
    {
      "epoch": 2.908503332660069,
      "grad_norm": 0.26163169741630554,
      "learning_rate": 4.289508632138114e-05,
      "loss": 2.2172,
      "step": 2250
    },
    {
      "epoch": 2.908503332660069,
      "eval_loss": 2.156771183013916,
      "eval_runtime": 668.1763,
      "eval_samples_per_second": 12.34,
      "eval_steps_per_second": 12.34,
      "step": 2250
    },
    {
      "epoch": 2.9731367400525146,
      "grad_norm": 0.26465606689453125,
      "learning_rate": 4.156706507304117e-05,
      "loss": 2.2313,
      "step": 2300
    },
    {
      "epoch": 3.0377701474449608,
      "grad_norm": 0.22578325867652893,
      "learning_rate": 4.02390438247012e-05,
      "loss": 2.2253,
      "step": 2350
    },
    {
      "epoch": 3.1024035548374065,
      "grad_norm": 0.23404210805892944,
      "learning_rate": 3.8911022576361225e-05,
      "loss": 2.2121,
      "step": 2400
    },
    {
      "epoch": 3.1670369622298526,
      "grad_norm": 0.2838682532310486,
      "learning_rate": 3.758300132802125e-05,
      "loss": 2.2218,
      "step": 2450
    },
    {
      "epoch": 3.2316703696222984,
      "grad_norm": 0.310476690530777,
      "learning_rate": 3.625498007968128e-05,
      "loss": 2.2159,
      "step": 2500
    },
    {
      "epoch": 3.2963037770147445,
      "grad_norm": 0.22608056664466858,
      "learning_rate": 3.492695883134131e-05,
      "loss": 2.2157,
      "step": 2550
    },
    {
      "epoch": 3.3609371844071907,
      "grad_norm": 0.23135437071323395,
      "learning_rate": 3.359893758300133e-05,
      "loss": 2.2255,
      "step": 2600
    },
    {
      "epoch": 3.4255705917996364,
      "grad_norm": 0.23839163780212402,
      "learning_rate": 3.2270916334661356e-05,
      "loss": 2.2223,
      "step": 2650
    },
    {
      "epoch": 3.4902039991920826,
      "grad_norm": 0.24394294619560242,
      "learning_rate": 3.094289508632138e-05,
      "loss": 2.2199,
      "step": 2700
    },
    {
      "epoch": 3.5548374065845283,
      "grad_norm": 0.2429606020450592,
      "learning_rate": 2.961487383798141e-05,
      "loss": 2.2015,
      "step": 2750
    },
    {
      "epoch": 3.6194708139769745,
      "grad_norm": 0.25373005867004395,
      "learning_rate": 2.8286852589641438e-05,
      "loss": 2.2152,
      "step": 2800
    },
    {
      "epoch": 3.68410422136942,
      "grad_norm": 0.2572405934333801,
      "learning_rate": 2.6958831341301462e-05,
      "loss": 2.2133,
      "step": 2850
    },
    {
      "epoch": 3.7487376287618663,
      "grad_norm": 0.2735884189605713,
      "learning_rate": 2.563081009296149e-05,
      "loss": 2.2171,
      "step": 2900
    },
    {
      "epoch": 3.8133710361543125,
      "grad_norm": 0.24303478002548218,
      "learning_rate": 2.4302788844621517e-05,
      "loss": 2.206,
      "step": 2950
    },
    {
      "epoch": 3.878004443546758,
      "grad_norm": 0.27165114879608154,
      "learning_rate": 2.297476759628154e-05,
      "loss": 2.2202,
      "step": 3000
    },
    {
      "epoch": 3.878004443546758,
      "eval_loss": 2.148362398147583,
      "eval_runtime": 668.4731,
      "eval_samples_per_second": 12.334,
      "eval_steps_per_second": 12.334,
      "step": 3000
    },
    {
      "epoch": 3.942637850939204,
      "grad_norm": 0.2713201940059662,
      "learning_rate": 2.1646746347941568e-05,
      "loss": 2.2085,
      "step": 3050
    },
    {
      "epoch": 4.0072712583316505,
      "grad_norm": 0.28700971603393555,
      "learning_rate": 2.0318725099601595e-05,
      "loss": 2.2154,
      "step": 3100
    },
    {
      "epoch": 4.071904665724096,
      "grad_norm": 0.2836604714393616,
      "learning_rate": 1.899070385126162e-05,
      "loss": 2.2011,
      "step": 3150
    },
    {
      "epoch": 4.136538073116542,
      "grad_norm": 0.26256245374679565,
      "learning_rate": 1.7662682602921647e-05,
      "loss": 2.2102,
      "step": 3200
    },
    {
      "epoch": 4.201171480508988,
      "grad_norm": 0.2922552227973938,
      "learning_rate": 1.6334661354581674e-05,
      "loss": 2.1935,
      "step": 3250
    },
    {
      "epoch": 4.265804887901434,
      "grad_norm": 0.26275578141212463,
      "learning_rate": 1.5006640106241702e-05,
      "loss": 2.209,
      "step": 3300
    },
    {
      "epoch": 4.33043829529388,
      "grad_norm": 0.28638267517089844,
      "learning_rate": 1.3678618857901726e-05,
      "loss": 2.2105,
      "step": 3350
    },
    {
      "epoch": 4.395071702686326,
      "grad_norm": 0.27328747510910034,
      "learning_rate": 1.2350597609561753e-05,
      "loss": 2.2171,
      "step": 3400
    },
    {
      "epoch": 4.459705110078772,
      "grad_norm": 0.2329448014497757,
      "learning_rate": 1.102257636122178e-05,
      "loss": 2.2133,
      "step": 3450
    },
    {
      "epoch": 4.524338517471218,
      "grad_norm": 0.24032790958881378,
      "learning_rate": 9.694555112881806e-06,
      "loss": 2.2225,
      "step": 3500
    },
    {
      "epoch": 4.588971924863664,
      "grad_norm": 0.2536959648132324,
      "learning_rate": 8.366533864541832e-06,
      "loss": 2.2047,
      "step": 3550
    },
    {
      "epoch": 4.6536053322561095,
      "grad_norm": 0.2353695183992386,
      "learning_rate": 7.03851261620186e-06,
      "loss": 2.2158,
      "step": 3600
    },
    {
      "epoch": 4.718238739648556,
      "grad_norm": 0.23979786038398743,
      "learning_rate": 5.710491367861886e-06,
      "loss": 2.219,
      "step": 3650
    },
    {
      "epoch": 4.782872147041002,
      "grad_norm": 0.2803194522857666,
      "learning_rate": 4.382470119521913e-06,
      "loss": 2.193,
      "step": 3700
    },
    {
      "epoch": 4.8475055544334475,
      "grad_norm": 0.24206270277500153,
      "learning_rate": 3.054448871181939e-06,
      "loss": 2.2079,
      "step": 3750
    },
    {
      "epoch": 4.8475055544334475,
      "eval_loss": 2.145188093185425,
      "eval_runtime": 669.1927,
      "eval_samples_per_second": 12.321,
      "eval_steps_per_second": 12.321,
      "step": 3750
    }
  ],
  "logging_steps": 50,
  "max_steps": 3865,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 750,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.3013315141632e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}