{ "best_metric": 1.1487760543823242, "best_model_checkpoint": "miner_id_24/checkpoint-250", "epoch": 1.0015698587127158, "eval_steps": 50, "global_step": 319, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0031397174254317113, "eval_loss": 2.547349452972412, "eval_runtime": 3.374, "eval_samples_per_second": 39.716, "eval_steps_per_second": 10.077, "step": 1 }, { "epoch": 0.03139717425431711, "grad_norm": 0.5533512830734253, "learning_rate": 4.2800000000000004e-05, "loss": 2.0151, "step": 10 }, { "epoch": 0.06279434850863422, "grad_norm": 0.8514082431793213, "learning_rate": 8.560000000000001e-05, "loss": 2.1655, "step": 20 }, { "epoch": 0.09419152276295134, "grad_norm": 1.2003597021102905, "learning_rate": 0.0001284, "loss": 1.9514, "step": 30 }, { "epoch": 0.12558869701726844, "grad_norm": 1.8414256572723389, "learning_rate": 0.00017120000000000001, "loss": 1.6826, "step": 40 }, { "epoch": 0.15698587127158556, "grad_norm": 2.62640118598938, "learning_rate": 0.000214, "loss": 1.8814, "step": 50 }, { "epoch": 0.15698587127158556, "eval_loss": 1.3949910402297974, "eval_runtime": 3.4107, "eval_samples_per_second": 39.288, "eval_steps_per_second": 9.969, "step": 50 }, { "epoch": 0.18838304552590268, "grad_norm": 0.8165822625160217, "learning_rate": 0.0002132711212563778, "loss": 1.4078, "step": 60 }, { "epoch": 0.21978021978021978, "grad_norm": 0.8177237510681152, "learning_rate": 0.00021109441519790188, "loss": 1.2985, "step": 70 }, { "epoch": 0.25117739403453687, "grad_norm": 0.7881454825401306, "learning_rate": 0.0002074995370540515, "loss": 1.368, "step": 80 }, { "epoch": 0.282574568288854, "grad_norm": 1.2316051721572876, "learning_rate": 0.00020253546309146147, "loss": 1.2715, "step": 90 }, { "epoch": 0.3139717425431711, "grad_norm": 1.8924660682678223, "learning_rate": 0.0001962698233660776, "loss": 1.4515, "step": 100 }, { "epoch": 0.3139717425431711, "eval_loss": 1.3114863634109497, "eval_runtime": 3.4175, "eval_samples_per_second": 39.21, "eval_steps_per_second": 9.949, "step": 100 }, { "epoch": 0.3453689167974882, "grad_norm": 0.6310524344444275, "learning_rate": 0.00018878798033791907, "loss": 1.3915, "step": 110 }, { "epoch": 0.37676609105180536, "grad_norm": 0.8549318313598633, "learning_rate": 0.00018019186590131008, "loss": 1.2935, "step": 120 }, { "epoch": 0.40816326530612246, "grad_norm": 0.740123987197876, "learning_rate": 0.0001705985926747663, "loss": 1.2393, "step": 130 }, { "epoch": 0.43956043956043955, "grad_norm": 0.9421306848526001, "learning_rate": 0.00016013885847018653, "loss": 1.3608, "step": 140 }, { "epoch": 0.47095761381475665, "grad_norm": 1.348279595375061, "learning_rate": 0.00014895516567870457, "loss": 1.3475, "step": 150 }, { "epoch": 0.47095761381475665, "eval_loss": 1.2188818454742432, "eval_runtime": 3.4253, "eval_samples_per_second": 39.12, "eval_steps_per_second": 9.926, "step": 150 }, { "epoch": 0.5023547880690737, "grad_norm": 0.7450600862503052, "learning_rate": 0.00013719987983211316, "loss": 1.3868, "step": 160 }, { "epoch": 0.533751962323391, "grad_norm": 0.5625097751617432, "learning_rate": 0.00012503315378982878, "loss": 1.2578, "step": 170 }, { "epoch": 0.565149136577708, "grad_norm": 0.7126911282539368, "learning_rate": 0.00011262074583207008, "loss": 1.307, "step": 180 }, { "epoch": 0.5965463108320251, "grad_norm": 0.8584557771682739, "learning_rate": 0.00010013176138533383, "loss": 1.3332, "step": 190 }, { "epoch": 0.6279434850863422, "grad_norm": 1.294969916343689, "learning_rate": 8.7736349146679e-05, "loss": 1.2446, "step": 200 }, { "epoch": 0.6279434850863422, "eval_loss": 1.1698250770568848, "eval_runtime": 3.4749, "eval_samples_per_second": 38.562, "eval_steps_per_second": 9.785, "step": 200 }, { "epoch": 0.6593406593406593, "grad_norm": 0.6069462895393372, "learning_rate": 7.560338299459509e-05, "loss": 1.2518, "step": 210 }, { "epoch": 0.6907378335949764, "grad_norm": 0.8310301899909973, "learning_rate": 6.389816126787358e-05, "loss": 1.2279, "step": 220 }, { "epoch": 0.7221350078492935, "grad_norm": 0.6590100526809692, "learning_rate": 5.2780154757280646e-05, "loss": 1.1994, "step": 230 }, { "epoch": 0.7535321821036107, "grad_norm": 0.8934396505355835, "learning_rate": 4.2400834091171904e-05, "loss": 1.2859, "step": 240 }, { "epoch": 0.7849293563579278, "grad_norm": 1.815680742263794, "learning_rate": 3.2901606114533654e-05, "loss": 1.266, "step": 250 }, { "epoch": 0.7849293563579278, "eval_loss": 1.1487760543823242, "eval_runtime": 3.4264, "eval_samples_per_second": 39.109, "eval_steps_per_second": 9.923, "step": 250 }, { "epoch": 0.8163265306122449, "grad_norm": 0.6432026624679565, "learning_rate": 2.4411887376019884e-05, "loss": 1.2566, "step": 260 }, { "epoch": 0.847723704866562, "grad_norm": 0.6584495306015015, "learning_rate": 1.7047340969608885e-05, "loss": 1.1352, "step": 270 }, { "epoch": 0.8791208791208791, "grad_norm": 0.6709410548210144, "learning_rate": 1.0908300751974463e-05, "loss": 1.1969, "step": 280 }, { "epoch": 0.9105180533751962, "grad_norm": 0.8905112147331238, "learning_rate": 6.078404403880684e-06, "loss": 1.2159, "step": 290 }, { "epoch": 0.9419152276295133, "grad_norm": 1.3594378232955933, "learning_rate": 2.623453958636396e-06, "loss": 1.2775, "step": 300 }, { "epoch": 0.9419152276295133, "eval_loss": 1.1542326211929321, "eval_runtime": 3.3774, "eval_samples_per_second": 39.675, "eval_steps_per_second": 10.067, "step": 300 }, { "epoch": 0.9733124018838305, "grad_norm": 0.6472459435462952, "learning_rate": 5.905193216585661e-07, "loss": 1.201, "step": 310 } ], "logging_steps": 10, "max_steps": 319, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3179286292791296e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }