lemexp-task1-v3-template_small_nodefs-deepseek-coder-6.7b-base
/
checkpoint-39589
/trainer_state.json
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 11.0, | |
| "eval_steps": 720, | |
| "global_step": 39589, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.13892747985551543, | |
| "grad_norm": 0.4299773573875427, | |
| "learning_rate": 0.0003953968695007873, | |
| "loss": 0.2967, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2000555709919422, | |
| "eval_loss": 0.1983761489391327, | |
| "eval_runtime": 16.4512, | |
| "eval_samples_per_second": 30.393, | |
| "eval_steps_per_second": 3.83, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.27785495971103086, | |
| "grad_norm": 0.5706949830055237, | |
| "learning_rate": 0.0003907659535056034, | |
| "loss": 0.1959, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4001111419838844, | |
| "eval_loss": 0.17383727431297302, | |
| "eval_runtime": 16.4842, | |
| "eval_samples_per_second": 30.332, | |
| "eval_steps_per_second": 3.822, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.41678243956654626, | |
| "grad_norm": 0.5392869710922241, | |
| "learning_rate": 0.0003861350375104196, | |
| "loss": 0.1723, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5557099194220617, | |
| "grad_norm": 0.5005412697792053, | |
| "learning_rate": 0.0003815041215152357, | |
| "loss": 0.1636, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6001667129758266, | |
| "eval_loss": 0.16229495406150818, | |
| "eval_runtime": 16.4963, | |
| "eval_samples_per_second": 30.31, | |
| "eval_steps_per_second": 3.819, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.6946373992775771, | |
| "grad_norm": 0.370914489030838, | |
| "learning_rate": 0.0003768732055200519, | |
| "loss": 0.1539, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8002222839677688, | |
| "eval_loss": 0.1537286937236786, | |
| "eval_runtime": 16.4662, | |
| "eval_samples_per_second": 30.365, | |
| "eval_steps_per_second": 3.826, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.8335648791330925, | |
| "grad_norm": 0.41378697752952576, | |
| "learning_rate": 0.000372242289524868, | |
| "loss": 0.1445, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.972492358988608, | |
| "grad_norm": 0.4000154137611389, | |
| "learning_rate": 0.0003676113735296842, | |
| "loss": 0.1384, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.000277854959711, | |
| "eval_loss": 0.15115150809288025, | |
| "eval_runtime": 16.8021, | |
| "eval_samples_per_second": 29.758, | |
| "eval_steps_per_second": 3.75, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.1114198388441234, | |
| "grad_norm": 0.4774770140647888, | |
| "learning_rate": 0.00036298045753450036, | |
| "loss": 0.1317, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.2003334259516532, | |
| "eval_loss": 0.14330309629440308, | |
| "eval_runtime": 16.4814, | |
| "eval_samples_per_second": 30.337, | |
| "eval_steps_per_second": 3.822, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.2503473186996388, | |
| "grad_norm": 0.4988621175289154, | |
| "learning_rate": 0.0003583495415393165, | |
| "loss": 0.1283, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3892747985551543, | |
| "grad_norm": 0.5688238739967346, | |
| "learning_rate": 0.000353727887376123, | |
| "loss": 0.1237, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4003889969435954, | |
| "eval_loss": 0.13683326542377472, | |
| "eval_runtime": 16.5065, | |
| "eval_samples_per_second": 30.291, | |
| "eval_steps_per_second": 3.817, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.5282022784106695, | |
| "grad_norm": 0.5433902740478516, | |
| "learning_rate": 0.00034909697138093914, | |
| "loss": 0.1174, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.6004445679355377, | |
| "eval_loss": 0.13955478370189667, | |
| "eval_runtime": 16.4989, | |
| "eval_samples_per_second": 30.305, | |
| "eval_steps_per_second": 3.818, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.667129758266185, | |
| "grad_norm": 0.5300644040107727, | |
| "learning_rate": 0.0003444660553857553, | |
| "loss": 0.1178, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.8005001389274797, | |
| "eval_loss": 0.13158197700977325, | |
| "eval_runtime": 16.533, | |
| "eval_samples_per_second": 30.243, | |
| "eval_steps_per_second": 3.811, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.8060572381217006, | |
| "grad_norm": 0.3573897182941437, | |
| "learning_rate": 0.0003398351393905715, | |
| "loss": 0.113, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.9449847179772157, | |
| "grad_norm": 0.5019258260726929, | |
| "learning_rate": 0.00033520422339538766, | |
| "loss": 0.1134, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.000555709919422, | |
| "eval_loss": 0.13124322891235352, | |
| "eval_runtime": 16.0696, | |
| "eval_samples_per_second": 31.115, | |
| "eval_steps_per_second": 3.92, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.0839121978327313, | |
| "grad_norm": 0.4890081286430359, | |
| "learning_rate": 0.0003305733074002038, | |
| "loss": 0.1075, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.2006112809113643, | |
| "eval_loss": 0.12690122425556183, | |
| "eval_runtime": 15.8697, | |
| "eval_samples_per_second": 31.507, | |
| "eval_steps_per_second": 3.97, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 2.222839677688247, | |
| "grad_norm": 0.5411983132362366, | |
| "learning_rate": 0.0003259423914050199, | |
| "loss": 0.104, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.361767157543762, | |
| "grad_norm": 0.892245352268219, | |
| "learning_rate": 0.00032132073724182645, | |
| "loss": 0.1018, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.4006668519033063, | |
| "eval_loss": 0.1253955215215683, | |
| "eval_runtime": 15.9212, | |
| "eval_samples_per_second": 31.405, | |
| "eval_steps_per_second": 3.957, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 2.5006946373992776, | |
| "grad_norm": 0.5154420137405396, | |
| "learning_rate": 0.0003166898212466426, | |
| "loss": 0.1018, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.600722422895249, | |
| "eval_loss": 0.1270376443862915, | |
| "eval_runtime": 15.8556, | |
| "eval_samples_per_second": 31.535, | |
| "eval_steps_per_second": 3.973, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 2.639622117254793, | |
| "grad_norm": 0.4247698187828064, | |
| "learning_rate": 0.0003120681670834491, | |
| "loss": 0.0988, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.7785495971103087, | |
| "grad_norm": 0.6174339652061462, | |
| "learning_rate": 0.0003074372510882653, | |
| "loss": 0.0931, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.800777993887191, | |
| "eval_loss": 0.12492711842060089, | |
| "eval_runtime": 15.9536, | |
| "eval_samples_per_second": 31.341, | |
| "eval_steps_per_second": 3.949, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 2.917477076965824, | |
| "grad_norm": 0.3945905864238739, | |
| "learning_rate": 0.0003028063350930814, | |
| "loss": 0.0924, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.000833564879133, | |
| "eval_loss": 0.12177152931690216, | |
| "eval_runtime": 16.5123, | |
| "eval_samples_per_second": 30.28, | |
| "eval_steps_per_second": 3.815, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.0564045568213394, | |
| "grad_norm": 0.4349508285522461, | |
| "learning_rate": 0.0002981754190978976, | |
| "loss": 0.0929, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.1953320366768545, | |
| "grad_norm": 0.5195356011390686, | |
| "learning_rate": 0.00029354450310271375, | |
| "loss": 0.0897, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.2008891358710754, | |
| "eval_loss": 0.12157219648361206, | |
| "eval_runtime": 15.887, | |
| "eval_samples_per_second": 31.472, | |
| "eval_steps_per_second": 3.965, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 3.33425951653237, | |
| "grad_norm": 0.38773760199546814, | |
| "learning_rate": 0.0002889135871075299, | |
| "loss": 0.0868, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.4009447068630174, | |
| "eval_loss": 0.12406055629253387, | |
| "eval_runtime": 15.9444, | |
| "eval_samples_per_second": 31.359, | |
| "eval_steps_per_second": 3.951, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 3.4731869963878856, | |
| "grad_norm": 0.3054683804512024, | |
| "learning_rate": 0.00028428267111234605, | |
| "loss": 0.0865, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.6010002778549595, | |
| "eval_loss": 0.11476034671068192, | |
| "eval_runtime": 15.9006, | |
| "eval_samples_per_second": 31.445, | |
| "eval_steps_per_second": 3.962, | |
| "step": 12960 | |
| }, | |
| { | |
| "epoch": 3.612114476243401, | |
| "grad_norm": 0.5311923623085022, | |
| "learning_rate": 0.0002796610169491526, | |
| "loss": 0.0845, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.7510419560989163, | |
| "grad_norm": 0.7641647458076477, | |
| "learning_rate": 0.0002750301009539687, | |
| "loss": 0.084, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.801055848846902, | |
| "eval_loss": 0.11587072908878326, | |
| "eval_runtime": 15.933, | |
| "eval_samples_per_second": 31.381, | |
| "eval_steps_per_second": 3.954, | |
| "step": 13680 | |
| }, | |
| { | |
| "epoch": 3.889969435954432, | |
| "grad_norm": 0.5842312574386597, | |
| "learning_rate": 0.00027039918495878483, | |
| "loss": 0.0815, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.001111419838844, | |
| "eval_loss": 0.11761430650949478, | |
| "eval_runtime": 16.0803, | |
| "eval_samples_per_second": 31.094, | |
| "eval_steps_per_second": 3.918, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.0288969158099475, | |
| "grad_norm": 0.5182059407234192, | |
| "learning_rate": 0.000265768268963601, | |
| "loss": 0.0823, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.167824395665463, | |
| "grad_norm": 0.3954576253890991, | |
| "learning_rate": 0.0002611373529684172, | |
| "loss": 0.0753, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.201166990830786, | |
| "eval_loss": 0.11391445249319077, | |
| "eval_runtime": 15.9483, | |
| "eval_samples_per_second": 31.351, | |
| "eval_steps_per_second": 3.95, | |
| "step": 15120 | |
| }, | |
| { | |
| "epoch": 4.306751875520978, | |
| "grad_norm": 0.5974435210227966, | |
| "learning_rate": 0.00025650643697323335, | |
| "loss": 0.0762, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.4012225618227285, | |
| "eval_loss": 0.11403658986091614, | |
| "eval_runtime": 15.92, | |
| "eval_samples_per_second": 31.407, | |
| "eval_steps_per_second": 3.957, | |
| "step": 15840 | |
| }, | |
| { | |
| "epoch": 4.445679355376494, | |
| "grad_norm": 0.4496535360813141, | |
| "learning_rate": 0.0002518755209780495, | |
| "loss": 0.0737, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.584606835232009, | |
| "grad_norm": 0.5617558360099792, | |
| "learning_rate": 0.0002472446049828656, | |
| "loss": 0.074, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.601278132814671, | |
| "eval_loss": 0.11306341737508774, | |
| "eval_runtime": 15.9244, | |
| "eval_samples_per_second": 31.398, | |
| "eval_steps_per_second": 3.956, | |
| "step": 16560 | |
| }, | |
| { | |
| "epoch": 4.723534315087524, | |
| "grad_norm": 0.5999208092689514, | |
| "learning_rate": 0.00024261368898768177, | |
| "loss": 0.0732, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.801333703806613, | |
| "eval_loss": 0.11077062785625458, | |
| "eval_runtime": 15.9311, | |
| "eval_samples_per_second": 31.385, | |
| "eval_steps_per_second": 3.955, | |
| "step": 17280 | |
| }, | |
| { | |
| "epoch": 4.86246179494304, | |
| "grad_norm": 0.3961442708969116, | |
| "learning_rate": 0.0002379920348244883, | |
| "loss": 0.0724, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.001389274798555, | |
| "grad_norm": 0.507563054561615, | |
| "learning_rate": 0.00023336111882930443, | |
| "loss": 0.0685, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.001389274798555, | |
| "eval_loss": 0.11523561179637909, | |
| "eval_runtime": 16.1329, | |
| "eval_samples_per_second": 30.992, | |
| "eval_steps_per_second": 3.905, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.14031675465407, | |
| "grad_norm": 0.5651789307594299, | |
| "learning_rate": 0.00022873020283412058, | |
| "loss": 0.0655, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.201444845790498, | |
| "eval_loss": 0.11398093402385712, | |
| "eval_runtime": 15.9337, | |
| "eval_samples_per_second": 31.38, | |
| "eval_steps_per_second": 3.954, | |
| "step": 18720 | |
| }, | |
| { | |
| "epoch": 5.279244234509586, | |
| "grad_norm": 0.619132399559021, | |
| "learning_rate": 0.00022409928683893675, | |
| "loss": 0.0664, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.401500416782439, | |
| "eval_loss": 0.11203750967979431, | |
| "eval_runtime": 15.8871, | |
| "eval_samples_per_second": 31.472, | |
| "eval_steps_per_second": 3.965, | |
| "step": 19440 | |
| }, | |
| { | |
| "epoch": 5.418171714365101, | |
| "grad_norm": 0.4724760353565216, | |
| "learning_rate": 0.0002194683708437529, | |
| "loss": 0.0636, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.5570991942206165, | |
| "grad_norm": 0.5861866474151611, | |
| "learning_rate": 0.00021483745484856908, | |
| "loss": 0.0648, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.601555987774382, | |
| "eval_loss": 0.1131061241030693, | |
| "eval_runtime": 15.952, | |
| "eval_samples_per_second": 31.344, | |
| "eval_steps_per_second": 3.949, | |
| "step": 20160 | |
| }, | |
| { | |
| "epoch": 5.6960266740761325, | |
| "grad_norm": 0.5262423157691956, | |
| "learning_rate": 0.0002102065388533852, | |
| "loss": 0.063, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.801611558766324, | |
| "eval_loss": 0.11248422414064407, | |
| "eval_runtime": 15.9153, | |
| "eval_samples_per_second": 31.416, | |
| "eval_steps_per_second": 3.958, | |
| "step": 20880 | |
| }, | |
| { | |
| "epoch": 5.834954153931648, | |
| "grad_norm": 0.6074294447898865, | |
| "learning_rate": 0.00020557562285820135, | |
| "loss": 0.0624, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 5.973881633787163, | |
| "grad_norm": 0.5349674820899963, | |
| "learning_rate": 0.0002009539686950079, | |
| "loss": 0.0609, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.001667129758266, | |
| "eval_loss": 0.11405794322490692, | |
| "eval_runtime": 16.1656, | |
| "eval_samples_per_second": 30.93, | |
| "eval_steps_per_second": 3.897, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.112809113642679, | |
| "grad_norm": 0.36713194847106934, | |
| "learning_rate": 0.00019632305269982403, | |
| "loss": 0.0576, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.201722700750208, | |
| "eval_loss": 0.11051186919212341, | |
| "eval_runtime": 15.9395, | |
| "eval_samples_per_second": 31.369, | |
| "eval_steps_per_second": 3.952, | |
| "step": 22320 | |
| }, | |
| { | |
| "epoch": 6.251736593498194, | |
| "grad_norm": 0.4714512526988983, | |
| "learning_rate": 0.00019169213670464018, | |
| "loss": 0.057, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.390664073353709, | |
| "grad_norm": 0.3419685363769531, | |
| "learning_rate": 0.00018706122070945633, | |
| "loss": 0.0572, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.401778271742151, | |
| "eval_loss": 0.1142740249633789, | |
| "eval_runtime": 15.9346, | |
| "eval_samples_per_second": 31.378, | |
| "eval_steps_per_second": 3.954, | |
| "step": 23040 | |
| }, | |
| { | |
| "epoch": 6.529591553209225, | |
| "grad_norm": 0.43148958683013916, | |
| "learning_rate": 0.0001824303047142725, | |
| "loss": 0.0554, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.601833842734093, | |
| "eval_loss": 0.1115042194724083, | |
| "eval_runtime": 15.9938, | |
| "eval_samples_per_second": 31.262, | |
| "eval_steps_per_second": 3.939, | |
| "step": 23760 | |
| }, | |
| { | |
| "epoch": 6.66851903306474, | |
| "grad_norm": 0.5623305439949036, | |
| "learning_rate": 0.00017779938871908863, | |
| "loss": 0.0538, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.801889413726035, | |
| "eval_loss": 0.11134042590856552, | |
| "eval_runtime": 15.9452, | |
| "eval_samples_per_second": 31.357, | |
| "eval_steps_per_second": 3.951, | |
| "step": 24480 | |
| }, | |
| { | |
| "epoch": 6.807446512920255, | |
| "grad_norm": 0.4066413640975952, | |
| "learning_rate": 0.0001731684727239048, | |
| "loss": 0.0534, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 6.946373992775771, | |
| "grad_norm": 0.4021354019641876, | |
| "learning_rate": 0.00016853755672872095, | |
| "loss": 0.052, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.001944984717977, | |
| "eval_loss": 0.1132456511259079, | |
| "eval_runtime": 16.1483, | |
| "eval_samples_per_second": 30.963, | |
| "eval_steps_per_second": 3.901, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.085301472631286, | |
| "grad_norm": 0.46669623255729675, | |
| "learning_rate": 0.00016391590256552746, | |
| "loss": 0.0498, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.20200055570992, | |
| "eval_loss": 0.11319959908723831, | |
| "eval_runtime": 15.9896, | |
| "eval_samples_per_second": 31.27, | |
| "eval_steps_per_second": 3.94, | |
| "step": 25920 | |
| }, | |
| { | |
| "epoch": 7.2242289524868015, | |
| "grad_norm": 0.2583458125591278, | |
| "learning_rate": 0.00015928498657034364, | |
| "loss": 0.0487, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.3631564323423175, | |
| "grad_norm": 0.3225070536136627, | |
| "learning_rate": 0.00015465407057515976, | |
| "loss": 0.0485, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.402056126701861, | |
| "eval_loss": 0.11151115596294403, | |
| "eval_runtime": 15.9848, | |
| "eval_samples_per_second": 31.28, | |
| "eval_steps_per_second": 3.941, | |
| "step": 26640 | |
| }, | |
| { | |
| "epoch": 7.502083912197833, | |
| "grad_norm": 0.4772126376628876, | |
| "learning_rate": 0.00015002315457997593, | |
| "loss": 0.0483, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.602111697693804, | |
| "eval_loss": 0.11146976053714752, | |
| "eval_runtime": 15.9333, | |
| "eval_samples_per_second": 31.381, | |
| "eval_steps_per_second": 3.954, | |
| "step": 27360 | |
| }, | |
| { | |
| "epoch": 7.641011392053348, | |
| "grad_norm": 0.7825577855110168, | |
| "learning_rate": 0.00014540150041678245, | |
| "loss": 0.0478, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 7.779938871908864, | |
| "grad_norm": 0.465191513299942, | |
| "learning_rate": 0.0001407705844215986, | |
| "loss": 0.0469, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 7.802167268685746, | |
| "eval_loss": 0.11259140819311142, | |
| "eval_runtime": 15.9752, | |
| "eval_samples_per_second": 31.299, | |
| "eval_steps_per_second": 3.944, | |
| "step": 28080 | |
| }, | |
| { | |
| "epoch": 7.918866351764379, | |
| "grad_norm": 0.2701134979724884, | |
| "learning_rate": 0.00013613966842641474, | |
| "loss": 0.0443, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 8.002222839677689, | |
| "eval_loss": 0.11337699741125107, | |
| "eval_runtime": 16.0812, | |
| "eval_samples_per_second": 31.092, | |
| "eval_steps_per_second": 3.918, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.057793831619895, | |
| "grad_norm": 0.3994615375995636, | |
| "learning_rate": 0.00013151801426322128, | |
| "loss": 0.044, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.19672131147541, | |
| "grad_norm": 0.46412038803100586, | |
| "learning_rate": 0.00012688709826803743, | |
| "loss": 0.0421, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.20227841066963, | |
| "eval_loss": 0.11495082080364227, | |
| "eval_runtime": 15.9654, | |
| "eval_samples_per_second": 31.318, | |
| "eval_steps_per_second": 3.946, | |
| "step": 29520 | |
| }, | |
| { | |
| "epoch": 8.335648791330925, | |
| "grad_norm": 0.3810461759567261, | |
| "learning_rate": 0.00012225618227285358, | |
| "loss": 0.0411, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.402333981661572, | |
| "eval_loss": 0.11439384520053864, | |
| "eval_runtime": 16.014, | |
| "eval_samples_per_second": 31.223, | |
| "eval_steps_per_second": 3.934, | |
| "step": 30240 | |
| }, | |
| { | |
| "epoch": 8.474576271186441, | |
| "grad_norm": 0.4397641122341156, | |
| "learning_rate": 0.00011762526627766973, | |
| "loss": 0.0412, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.602389552653515, | |
| "eval_loss": 0.11167102307081223, | |
| "eval_runtime": 15.9143, | |
| "eval_samples_per_second": 31.418, | |
| "eval_steps_per_second": 3.959, | |
| "step": 30960 | |
| }, | |
| { | |
| "epoch": 8.613503751041955, | |
| "grad_norm": 0.7023443579673767, | |
| "learning_rate": 0.00011300361211447625, | |
| "loss": 0.041, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 8.752431230897471, | |
| "grad_norm": 0.5792316198348999, | |
| "learning_rate": 0.00010837269611929239, | |
| "loss": 0.0391, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 8.802445123645457, | |
| "eval_loss": 0.11271476745605469, | |
| "eval_runtime": 15.9859, | |
| "eval_samples_per_second": 31.278, | |
| "eval_steps_per_second": 3.941, | |
| "step": 31680 | |
| }, | |
| { | |
| "epoch": 8.891358710752987, | |
| "grad_norm": 0.44151026010513306, | |
| "learning_rate": 0.00010374178012410855, | |
| "loss": 0.0403, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.0025006946374, | |
| "eval_loss": 0.11616696417331696, | |
| "eval_runtime": 16.1103, | |
| "eval_samples_per_second": 31.036, | |
| "eval_steps_per_second": 3.911, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.030286190608502, | |
| "grad_norm": 0.3094378411769867, | |
| "learning_rate": 9.911086412892471e-05, | |
| "loss": 0.0386, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.169213670464018, | |
| "grad_norm": 0.4858907163143158, | |
| "learning_rate": 9.447994813374086e-05, | |
| "loss": 0.0354, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.202556265629342, | |
| "eval_loss": 0.11926531791687012, | |
| "eval_runtime": 16.0468, | |
| "eval_samples_per_second": 31.159, | |
| "eval_steps_per_second": 3.926, | |
| "step": 33120 | |
| }, | |
| { | |
| "epoch": 9.308141150319534, | |
| "grad_norm": 0.40236544609069824, | |
| "learning_rate": 8.9849032138557e-05, | |
| "loss": 0.0354, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.402611836621285, | |
| "eval_loss": 0.12175790965557098, | |
| "eval_runtime": 15.9794, | |
| "eval_samples_per_second": 31.29, | |
| "eval_steps_per_second": 3.943, | |
| "step": 33840 | |
| }, | |
| { | |
| "epoch": 9.447068630175048, | |
| "grad_norm": 0.8239908218383789, | |
| "learning_rate": 8.521811614337317e-05, | |
| "loss": 0.035, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.585996110030564, | |
| "grad_norm": 0.2754063308238983, | |
| "learning_rate": 8.059646198017968e-05, | |
| "loss": 0.0352, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 9.602667407613225, | |
| "eval_loss": 0.11963404715061188, | |
| "eval_runtime": 15.9949, | |
| "eval_samples_per_second": 31.26, | |
| "eval_steps_per_second": 3.939, | |
| "step": 34560 | |
| }, | |
| { | |
| "epoch": 9.72492358988608, | |
| "grad_norm": 0.4275870621204376, | |
| "learning_rate": 7.596554598499583e-05, | |
| "loss": 0.0356, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 9.802722978605168, | |
| "eval_loss": 0.12364204972982407, | |
| "eval_runtime": 15.9291, | |
| "eval_samples_per_second": 31.389, | |
| "eval_steps_per_second": 3.955, | |
| "step": 35280 | |
| }, | |
| { | |
| "epoch": 9.863851069741594, | |
| "grad_norm": 0.41111400723457336, | |
| "learning_rate": 7.133462998981199e-05, | |
| "loss": 0.0322, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 10.00277854959711, | |
| "grad_norm": 0.4361058175563812, | |
| "learning_rate": 6.67129758266185e-05, | |
| "loss": 0.0331, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.00277854959711, | |
| "eval_loss": 0.1233987957239151, | |
| "eval_runtime": 16.0606, | |
| "eval_samples_per_second": 31.132, | |
| "eval_steps_per_second": 3.923, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.141706029452626, | |
| "grad_norm": 0.3986058235168457, | |
| "learning_rate": 6.208205983143465e-05, | |
| "loss": 0.032, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 10.202834120589053, | |
| "eval_loss": 0.12648221850395203, | |
| "eval_runtime": 15.987, | |
| "eval_samples_per_second": 31.275, | |
| "eval_steps_per_second": 3.941, | |
| "step": 36720 | |
| }, | |
| { | |
| "epoch": 10.28063350930814, | |
| "grad_norm": 0.1770099699497223, | |
| "learning_rate": 5.745114383625081e-05, | |
| "loss": 0.0302, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.402889691580995, | |
| "eval_loss": 0.1288907825946808, | |
| "eval_runtime": 15.9585, | |
| "eval_samples_per_second": 31.331, | |
| "eval_steps_per_second": 3.948, | |
| "step": 37440 | |
| }, | |
| { | |
| "epoch": 10.419560989163656, | |
| "grad_norm": 0.33153316378593445, | |
| "learning_rate": 5.282022784106696e-05, | |
| "loss": 0.0299, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 10.558488469019172, | |
| "grad_norm": 0.4955579340457916, | |
| "learning_rate": 4.818931184588312e-05, | |
| "loss": 0.0301, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 10.602945262572938, | |
| "eval_loss": 0.1280115395784378, | |
| "eval_runtime": 15.9981, | |
| "eval_samples_per_second": 31.254, | |
| "eval_steps_per_second": 3.938, | |
| "step": 38160 | |
| }, | |
| { | |
| "epoch": 10.697415948874687, | |
| "grad_norm": 0.6109702587127686, | |
| "learning_rate": 4.355839585069927e-05, | |
| "loss": 0.0295, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 10.803000833564878, | |
| "eval_loss": 0.12585216760635376, | |
| "eval_runtime": 15.9876, | |
| "eval_samples_per_second": 31.274, | |
| "eval_steps_per_second": 3.941, | |
| "step": 38880 | |
| }, | |
| { | |
| "epoch": 10.836343428730203, | |
| "grad_norm": 0.3070131242275238, | |
| "learning_rate": 3.8927479855515425e-05, | |
| "loss": 0.0298, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 10.975270908585719, | |
| "grad_norm": 0.5334280133247375, | |
| "learning_rate": 3.429656386033158e-05, | |
| "loss": 0.028, | |
| "step": 39500 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 43188, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 12, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.419404232849097e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |