lemexp-task1-v3-template_small_nodefs-deepseek-coder-6.7b-base/checkpoint-17995/trainer_state.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 720,
  "global_step": 17995,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.13892747985551543,
      "grad_norm": 0.4299773573875427,
      "learning_rate": 0.0003953968695007873,
      "loss": 0.2967,
      "step": 500
    },
    {
      "epoch": 0.2000555709919422,
      "eval_loss": 0.1983761489391327,
      "eval_runtime": 16.4512,
      "eval_samples_per_second": 30.393,
      "eval_steps_per_second": 3.83,
      "step": 720
    },
    {
      "epoch": 0.27785495971103086,
      "grad_norm": 0.5706949830055237,
      "learning_rate": 0.0003907659535056034,
      "loss": 0.1959,
      "step": 1000
    },
    {
      "epoch": 0.4001111419838844,
      "eval_loss": 0.17383727431297302,
      "eval_runtime": 16.4842,
      "eval_samples_per_second": 30.332,
      "eval_steps_per_second": 3.822,
      "step": 1440
    },
    {
      "epoch": 0.41678243956654626,
      "grad_norm": 0.5392869710922241,
      "learning_rate": 0.0003861350375104196,
      "loss": 0.1723,
      "step": 1500
    },
    {
      "epoch": 0.5557099194220617,
      "grad_norm": 0.5005412697792053,
      "learning_rate": 0.0003815041215152357,
      "loss": 0.1636,
      "step": 2000
    },
    {
      "epoch": 0.6001667129758266,
      "eval_loss": 0.16229495406150818,
      "eval_runtime": 16.4963,
      "eval_samples_per_second": 30.31,
      "eval_steps_per_second": 3.819,
      "step": 2160
    },
    {
      "epoch": 0.6946373992775771,
      "grad_norm": 0.370914489030838,
      "learning_rate": 0.0003768732055200519,
      "loss": 0.1539,
      "step": 2500
    },
    {
      "epoch": 0.8002222839677688,
      "eval_loss": 0.1537286937236786,
      "eval_runtime": 16.4662,
      "eval_samples_per_second": 30.365,
      "eval_steps_per_second": 3.826,
      "step": 2880
    },
    {
      "epoch": 0.8335648791330925,
      "grad_norm": 0.41378697752952576,
      "learning_rate": 0.000372242289524868,
      "loss": 0.1445,
      "step": 3000
    },
    {
      "epoch": 0.972492358988608,
      "grad_norm": 0.4000154137611389,
      "learning_rate": 0.0003676113735296842,
      "loss": 0.1384,
      "step": 3500
    },
    {
      "epoch": 1.000277854959711,
      "eval_loss": 0.15115150809288025,
      "eval_runtime": 16.8021,
      "eval_samples_per_second": 29.758,
      "eval_steps_per_second": 3.75,
      "step": 3600
    },
    {
      "epoch": 1.1114198388441234,
      "grad_norm": 0.4774770140647888,
      "learning_rate": 0.00036298045753450036,
      "loss": 0.1317,
      "step": 4000
    },
    {
      "epoch": 1.2003334259516532,
      "eval_loss": 0.14330309629440308,
      "eval_runtime": 16.4814,
      "eval_samples_per_second": 30.337,
      "eval_steps_per_second": 3.822,
      "step": 4320
    },
    {
      "epoch": 1.2503473186996388,
      "grad_norm": 0.4988621175289154,
      "learning_rate": 0.0003583495415393165,
      "loss": 0.1283,
      "step": 4500
    },
    {
      "epoch": 1.3892747985551543,
      "grad_norm": 0.5688238739967346,
      "learning_rate": 0.000353727887376123,
      "loss": 0.1237,
      "step": 5000
    },
    {
      "epoch": 1.4003889969435954,
      "eval_loss": 0.13683326542377472,
      "eval_runtime": 16.5065,
      "eval_samples_per_second": 30.291,
      "eval_steps_per_second": 3.817,
      "step": 5040
    },
    {
      "epoch": 1.5282022784106695,
      "grad_norm": 0.5433902740478516,
      "learning_rate": 0.00034909697138093914,
      "loss": 0.1174,
      "step": 5500
    },
    {
      "epoch": 1.6004445679355377,
      "eval_loss": 0.13955478370189667,
      "eval_runtime": 16.4989,
      "eval_samples_per_second": 30.305,
      "eval_steps_per_second": 3.818,
      "step": 5760
    },
    {
      "epoch": 1.667129758266185,
      "grad_norm": 0.5300644040107727,
      "learning_rate": 0.0003444660553857553,
      "loss": 0.1178,
      "step": 6000
    },
    {
      "epoch": 1.8005001389274797,
      "eval_loss": 0.13158197700977325,
      "eval_runtime": 16.533,
      "eval_samples_per_second": 30.243,
      "eval_steps_per_second": 3.811,
      "step": 6480
    },
    {
      "epoch": 1.8060572381217006,
      "grad_norm": 0.3573897182941437,
      "learning_rate": 0.0003398351393905715,
      "loss": 0.113,
      "step": 6500
    },
    {
      "epoch": 1.9449847179772157,
      "grad_norm": 0.5019258260726929,
      "learning_rate": 0.00033520422339538766,
      "loss": 0.1134,
      "step": 7000
    },
    {
      "epoch": 2.000555709919422,
      "eval_loss": 0.13124322891235352,
      "eval_runtime": 16.0696,
      "eval_samples_per_second": 31.115,
      "eval_steps_per_second": 3.92,
      "step": 7200
    },
    {
      "epoch": 2.0839121978327313,
      "grad_norm": 0.4890081286430359,
      "learning_rate": 0.0003305733074002038,
      "loss": 0.1075,
      "step": 7500
    },
    {
      "epoch": 2.2006112809113643,
      "eval_loss": 0.12690122425556183,
      "eval_runtime": 15.8697,
      "eval_samples_per_second": 31.507,
      "eval_steps_per_second": 3.97,
      "step": 7920
    },
    {
      "epoch": 2.222839677688247,
      "grad_norm": 0.5411983132362366,
      "learning_rate": 0.0003259423914050199,
      "loss": 0.104,
      "step": 8000
    },
    {
      "epoch": 2.361767157543762,
      "grad_norm": 0.892245352268219,
      "learning_rate": 0.00032132073724182645,
      "loss": 0.1018,
      "step": 8500
    },
    {
      "epoch": 2.4006668519033063,
      "eval_loss": 0.1253955215215683,
      "eval_runtime": 15.9212,
      "eval_samples_per_second": 31.405,
      "eval_steps_per_second": 3.957,
      "step": 8640
    },
    {
      "epoch": 2.5006946373992776,
      "grad_norm": 0.5154420137405396,
      "learning_rate": 0.0003166898212466426,
      "loss": 0.1018,
      "step": 9000
    },
    {
      "epoch": 2.600722422895249,
      "eval_loss": 0.1270376443862915,
      "eval_runtime": 15.8556,
      "eval_samples_per_second": 31.535,
      "eval_steps_per_second": 3.973,
      "step": 9360
    },
    {
      "epoch": 2.639622117254793,
      "grad_norm": 0.4247698187828064,
      "learning_rate": 0.0003120681670834491,
      "loss": 0.0988,
      "step": 9500
    },
    {
      "epoch": 2.7785495971103087,
      "grad_norm": 0.6174339652061462,
      "learning_rate": 0.0003074372510882653,
      "loss": 0.0931,
      "step": 10000
    },
    {
      "epoch": 2.800777993887191,
      "eval_loss": 0.12492711842060089,
      "eval_runtime": 15.9536,
      "eval_samples_per_second": 31.341,
      "eval_steps_per_second": 3.949,
      "step": 10080
    },
    {
      "epoch": 2.917477076965824,
      "grad_norm": 0.3945905864238739,
      "learning_rate": 0.0003028063350930814,
      "loss": 0.0924,
      "step": 10500
    },
    {
      "epoch": 3.000833564879133,
      "eval_loss": 0.12177152931690216,
      "eval_runtime": 16.5123,
      "eval_samples_per_second": 30.28,
      "eval_steps_per_second": 3.815,
      "step": 10800
    },
    {
      "epoch": 3.0564045568213394,
      "grad_norm": 0.4349508285522461,
      "learning_rate": 0.0002981754190978976,
      "loss": 0.0929,
      "step": 11000
    },
    {
      "epoch": 3.1953320366768545,
      "grad_norm": 0.5195356011390686,
      "learning_rate": 0.00029354450310271375,
      "loss": 0.0897,
      "step": 11500
    },
    {
      "epoch": 3.2008891358710754,
      "eval_loss": 0.12157219648361206,
      "eval_runtime": 15.887,
      "eval_samples_per_second": 31.472,
      "eval_steps_per_second": 3.965,
      "step": 11520
    },
    {
      "epoch": 3.33425951653237,
      "grad_norm": 0.38773760199546814,
      "learning_rate": 0.0002889135871075299,
      "loss": 0.0868,
      "step": 12000
    },
    {
      "epoch": 3.4009447068630174,
      "eval_loss": 0.12406055629253387,
      "eval_runtime": 15.9444,
      "eval_samples_per_second": 31.359,
      "eval_steps_per_second": 3.951,
      "step": 12240
    },
    {
      "epoch": 3.4731869963878856,
      "grad_norm": 0.3054683804512024,
      "learning_rate": 0.00028428267111234605,
      "loss": 0.0865,
      "step": 12500
    },
    {
      "epoch": 3.6010002778549595,
      "eval_loss": 0.11476034671068192,
      "eval_runtime": 15.9006,
      "eval_samples_per_second": 31.445,
      "eval_steps_per_second": 3.962,
      "step": 12960
    },
    {
      "epoch": 3.612114476243401,
      "grad_norm": 0.5311923623085022,
      "learning_rate": 0.0002796610169491526,
      "loss": 0.0845,
      "step": 13000
    },
    {
      "epoch": 3.7510419560989163,
      "grad_norm": 0.7641647458076477,
      "learning_rate": 0.0002750301009539687,
      "loss": 0.084,
      "step": 13500
    },
    {
      "epoch": 3.801055848846902,
      "eval_loss": 0.11587072908878326,
      "eval_runtime": 15.933,
      "eval_samples_per_second": 31.381,
      "eval_steps_per_second": 3.954,
      "step": 13680
    },
    {
      "epoch": 3.889969435954432,
      "grad_norm": 0.5842312574386597,
      "learning_rate": 0.00027039918495878483,
      "loss": 0.0815,
      "step": 14000
    },
    {
      "epoch": 4.001111419838844,
      "eval_loss": 0.11761430650949478,
      "eval_runtime": 16.0803,
      "eval_samples_per_second": 31.094,
      "eval_steps_per_second": 3.918,
      "step": 14400
    },
    {
      "epoch": 4.0288969158099475,
      "grad_norm": 0.5182059407234192,
      "learning_rate": 0.000265768268963601,
      "loss": 0.0823,
      "step": 14500
    },
    {
      "epoch": 4.167824395665463,
      "grad_norm": 0.3954576253890991,
      "learning_rate": 0.0002611373529684172,
      "loss": 0.0753,
      "step": 15000
    },
    {
      "epoch": 4.201166990830786,
      "eval_loss": 0.11391445249319077,
      "eval_runtime": 15.9483,
      "eval_samples_per_second": 31.351,
      "eval_steps_per_second": 3.95,
      "step": 15120
    },
    {
      "epoch": 4.306751875520978,
      "grad_norm": 0.5974435210227966,
      "learning_rate": 0.00025650643697323335,
      "loss": 0.0762,
      "step": 15500
    },
    {
      "epoch": 4.4012225618227285,
      "eval_loss": 0.11403658986091614,
      "eval_runtime": 15.92,
      "eval_samples_per_second": 31.407,
      "eval_steps_per_second": 3.957,
      "step": 15840
    },
    {
      "epoch": 4.445679355376494,
      "grad_norm": 0.4496535360813141,
      "learning_rate": 0.0002518755209780495,
      "loss": 0.0737,
      "step": 16000
    },
    {
      "epoch": 4.584606835232009,
      "grad_norm": 0.5617558360099792,
      "learning_rate": 0.0002472446049828656,
      "loss": 0.074,
      "step": 16500
    },
    {
      "epoch": 4.601278132814671,
      "eval_loss": 0.11306341737508774,
      "eval_runtime": 15.9244,
      "eval_samples_per_second": 31.398,
      "eval_steps_per_second": 3.956,
      "step": 16560
    },
    {
      "epoch": 4.723534315087524,
      "grad_norm": 0.5999208092689514,
      "learning_rate": 0.00024261368898768177,
      "loss": 0.0732,
      "step": 17000
    },
    {
      "epoch": 4.801333703806613,
      "eval_loss": 0.11077062785625458,
      "eval_runtime": 15.9311,
      "eval_samples_per_second": 31.385,
      "eval_steps_per_second": 3.955,
      "step": 17280
    },
    {
      "epoch": 4.86246179494304,
      "grad_norm": 0.3961442708969116,
      "learning_rate": 0.0002379920348244883,
      "loss": 0.0724,
      "step": 17500
    }
  ],
  "logging_steps": 500,
  "max_steps": 43188,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.3736461282464236e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}