{ "best_metric": 2.572946310043335, "best_model_checkpoint": "miner_id_24/checkpoint-300", "epoch": 0.04610065309258548, "eval_steps": 50, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001536688436419516, "eval_loss": 3.802302360534668, "eval_runtime": 176.2084, "eval_samples_per_second": 15.55, "eval_steps_per_second": 3.887, "step": 1 }, { "epoch": 0.001536688436419516, "grad_norm": 29.49568748474121, "learning_rate": 4.16e-05, "loss": 5.6992, "step": 10 }, { "epoch": 0.003073376872839032, "grad_norm": 11.658048629760742, "learning_rate": 8.32e-05, "loss": 6.3089, "step": 20 }, { "epoch": 0.004610065309258548, "grad_norm": 14.371132850646973, "learning_rate": 0.0001248, "loss": 5.7372, "step": 30 }, { "epoch": 0.006146753745678064, "grad_norm": 18.474430084228516, "learning_rate": 0.0001664, "loss": 6.365, "step": 40 }, { "epoch": 0.00768344218209758, "grad_norm": 42.71783447265625, "learning_rate": 0.000208, "loss": 7.0134, "step": 50 }, { "epoch": 0.00768344218209758, "eval_loss": 3.467595338821411, "eval_runtime": 175.6638, "eval_samples_per_second": 15.598, "eval_steps_per_second": 3.899, "step": 50 }, { "epoch": 0.009220130618517095, "grad_norm": 8.05245590209961, "learning_rate": 0.0002077466612270217, "loss": 5.4029, "step": 60 }, { "epoch": 0.010756819054936612, "grad_norm": 7.458392143249512, "learning_rate": 0.0002069878791491233, "loss": 5.7947, "step": 70 }, { "epoch": 0.012293507491356128, "grad_norm": 27.035818099975586, "learning_rate": 0.00020572735047631578, "loss": 5.5678, "step": 80 }, { "epoch": 0.013830195927775643, "grad_norm": 17.315340042114258, "learning_rate": 0.00020397121637758515, "loss": 6.5471, "step": 90 }, { "epoch": 0.01536688436419516, "grad_norm": 33.20652770996094, "learning_rate": 0.00020172803256173445, "loss": 7.03, "step": 100 }, { "epoch": 0.01536688436419516, "eval_loss": 3.3676817417144775, "eval_runtime": 176.2178, "eval_samples_per_second": 15.549, "eval_steps_per_second": 3.887, "step": 100 }, { "epoch": 0.016903572800614674, "grad_norm": 6.50840425491333, "learning_rate": 0.00019900872759483047, "loss": 5.6033, "step": 110 }, { "epoch": 0.01844026123703419, "grad_norm": 6.7641825675964355, "learning_rate": 0.0001958265496573284, "loss": 5.4601, "step": 120 }, { "epoch": 0.019976949673453707, "grad_norm": 14.109837532043457, "learning_rate": 0.00019219700200026827, "loss": 5.4273, "step": 130 }, { "epoch": 0.021513638109873223, "grad_norm": 11.948671340942383, "learning_rate": 0.0001881377674149945, "loss": 6.1093, "step": 140 }, { "epoch": 0.02305032654629274, "grad_norm": 21.177404403686523, "learning_rate": 0.00018366862208437368, "loss": 6.2444, "step": 150 }, { "epoch": 0.02305032654629274, "eval_loss": 3.1878161430358887, "eval_runtime": 175.8544, "eval_samples_per_second": 15.581, "eval_steps_per_second": 3.895, "step": 150 }, { "epoch": 0.024587014982712256, "grad_norm": 6.276320457458496, "learning_rate": 0.00017881133923521971, "loss": 5.5092, "step": 160 }, { "epoch": 0.026123703419131773, "grad_norm": 6.799064636230469, "learning_rate": 0.00017358958306132124, "loss": 5.1537, "step": 170 }, { "epoch": 0.027660391855551286, "grad_norm": 9.907474517822266, "learning_rate": 0.00016802879343386844, "loss": 5.5056, "step": 180 }, { "epoch": 0.029197080291970802, "grad_norm": 13.780465126037598, "learning_rate": 0.00016215606196095766, "loss": 5.6722, "step": 190 }, { "epoch": 0.03073376872839032, "grad_norm": 17.128713607788086, "learning_rate": 0.000156, "loss": 5.9664, "step": 200 }, { "epoch": 0.03073376872839032, "eval_loss": 3.0006656646728516, "eval_runtime": 175.6129, "eval_samples_per_second": 15.602, "eval_steps_per_second": 3.901, "step": 200 }, { "epoch": 0.032270457164809835, "grad_norm": 4.835846900939941, "learning_rate": 0.00014959059926606403, "loss": 5.1, "step": 210 }, { "epoch": 0.03380714560122935, "grad_norm": 5.523816108703613, "learning_rate": 0.00014295908571525487, "loss": 4.9443, "step": 220 }, { "epoch": 0.03534383403764887, "grad_norm": 9.212928771972656, "learning_rate": 0.00013613776741499452, "loss": 4.8833, "step": 230 }, { "epoch": 0.03688052247406838, "grad_norm": 10.690314292907715, "learning_rate": 0.00012915987714236542, "loss": 5.425, "step": 240 }, { "epoch": 0.0384172109104879, "grad_norm": 16.973690032958984, "learning_rate": 0.00012205941047736077, "loss": 5.6566, "step": 250 }, { "epoch": 0.0384172109104879, "eval_loss": 2.7571074962615967, "eval_runtime": 175.489, "eval_samples_per_second": 15.614, "eval_steps_per_second": 3.903, "step": 250 }, { "epoch": 0.039953899346907414, "grad_norm": 4.981348514556885, "learning_rate": 0.00011487096017983597, "loss": 4.7851, "step": 260 }, { "epoch": 0.041490587783326933, "grad_norm": 6.857975482940674, "learning_rate": 0.00010762954765706012, "loss": 4.6581, "step": 270 }, { "epoch": 0.043027276219746446, "grad_norm": 8.468694686889648, "learning_rate": 0.00010037045234293992, "loss": 4.5606, "step": 280 }, { "epoch": 0.04456396465616596, "grad_norm": 11.279480934143066, "learning_rate": 9.312903982016405e-05, "loss": 5.3172, "step": 290 }, { "epoch": 0.04610065309258548, "grad_norm": 19.332422256469727, "learning_rate": 8.594058952263925e-05, "loss": 5.3847, "step": 300 }, { "epoch": 0.04610065309258548, "eval_loss": 2.572946310043335, "eval_runtime": 175.6968, "eval_samples_per_second": 15.595, "eval_steps_per_second": 3.899, "step": 300 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.902866787074048e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }