{ "best_metric": 0.549039363861084, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.21226415094339623, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004716981132075472, "eval_loss": 4.834980487823486, "eval_runtime": 79.2702, "eval_samples_per_second": 11.265, "eval_steps_per_second": 2.826, "step": 1 }, { "epoch": 0.0047169811320754715, "grad_norm": 3.147049903869629, "learning_rate": 4.22e-05, "loss": 2.5826, "step": 10 }, { "epoch": 0.009433962264150943, "grad_norm": 3.3617496490478516, "learning_rate": 8.44e-05, "loss": 2.4648, "step": 20 }, { "epoch": 0.014150943396226415, "grad_norm": 2.916816473007202, "learning_rate": 0.0001266, "loss": 2.118, "step": 30 }, { "epoch": 0.018867924528301886, "grad_norm": 0.0006454469403252006, "learning_rate": 0.0001688, "loss": 3.8828, "step": 40 }, { "epoch": 0.02358490566037736, "grad_norm": 1.0803881878018728e-06, "learning_rate": 0.000211, "loss": 0.0, "step": 50 }, { "epoch": 0.02358490566037736, "eval_loss": 25.65457534790039, "eval_runtime": 79.178, "eval_samples_per_second": 11.278, "eval_steps_per_second": 2.829, "step": 50 }, { "epoch": 0.02830188679245283, "grad_norm": 3.4077255725860596, "learning_rate": 0.00021074300730241147, "loss": 13.6957, "step": 60 }, { "epoch": 0.0330188679245283, "grad_norm": 2.4732985496520996, "learning_rate": 0.00020997328125223568, "loss": 1.8985, "step": 70 }, { "epoch": 0.03773584905660377, "grad_norm": 2.2590270042419434, "learning_rate": 0.0002086945718774165, "loss": 1.6445, "step": 80 }, { "epoch": 0.04245283018867924, "grad_norm": 2.0797111574211158e-05, "learning_rate": 0.00020691310892149265, "loss": 4.5727, "step": 90 }, { "epoch": 0.04716981132075472, "grad_norm": 9.767625306267291e-05, "learning_rate": 0.00020463757149291335, "loss": 0.0, "step": 100 }, { "epoch": 0.04716981132075472, "eval_loss": 11.30462646484375, "eval_runtime": 79.3874, "eval_samples_per_second": 11.249, "eval_steps_per_second": 2.822, "step": 100 }, { "epoch": 0.05188679245283019, "grad_norm": 2.947974920272827, "learning_rate": 0.0002018790457812944, "loss": 5.8256, "step": 110 }, { "epoch": 0.05660377358490566, "grad_norm": 2.1973209381103516, "learning_rate": 0.0001986509710466168, "loss": 1.752, "step": 120 }, { "epoch": 0.06132075471698113, "grad_norm": 2.1051247119903564, "learning_rate": 0.00019496907414450293, "loss": 1.5014, "step": 130 }, { "epoch": 0.0660377358490566, "grad_norm": 0.14951299130916595, "learning_rate": 0.00019085129290655697, "loss": 10.0941, "step": 140 }, { "epoch": 0.07075471698113207, "grad_norm": 2.2972593797021545e-05, "learning_rate": 0.00018631768874905217, "loss": 0.0, "step": 150 }, { "epoch": 0.07075471698113207, "eval_loss": 7.184693813323975, "eval_runtime": 79.1678, "eval_samples_per_second": 11.28, "eval_steps_per_second": 2.829, "step": 150 }, { "epoch": 0.07547169811320754, "grad_norm": 3.277813196182251, "learning_rate": 0.0001813903489357277, "loss": 5.2341, "step": 160 }, { "epoch": 0.08018867924528301, "grad_norm": 2.0672781467437744, "learning_rate": 0.00017609327897085954, "loss": 1.6099, "step": 170 }, { "epoch": 0.08490566037735849, "grad_norm": 3.1866703033447266, "learning_rate": 0.00017045228564685694, "loss": 1.7181, "step": 180 }, { "epoch": 0.08962264150943396, "grad_norm": 0.06553133577108383, "learning_rate": 0.0001644948513161638, "loss": 4.2497, "step": 190 }, { "epoch": 0.09433962264150944, "grad_norm": 0.0005188188515603542, "learning_rate": 0.00015825, "loss": 0.0001, "step": 200 }, { "epoch": 0.09433962264150944, "eval_loss": 3.776930809020996, "eval_runtime": 79.0735, "eval_samples_per_second": 11.293, "eval_steps_per_second": 2.833, "step": 200 }, { "epoch": 0.09905660377358491, "grad_norm": 2.7126495838165283, "learning_rate": 0.00015174815598624768, "loss": 3.9974, "step": 210 }, { "epoch": 0.10377358490566038, "grad_norm": 3.26777720451355, "learning_rate": 0.00014502099560537873, "loss": 1.626, "step": 220 }, { "epoch": 0.10849056603773585, "grad_norm": 3.467517614364624, "learning_rate": 0.00013810129290655696, "loss": 1.7558, "step": 230 }, { "epoch": 0.11320754716981132, "grad_norm": 0.0017427363200113177, "learning_rate": 0.00013102275998576495, "loss": 3.0762, "step": 240 }, { "epoch": 0.1179245283018868, "grad_norm": 0.0014397504273802042, "learning_rate": 0.00012381988274386116, "loss": 0.0, "step": 250 }, { "epoch": 0.1179245283018868, "eval_loss": 3.8721871376037598, "eval_runtime": 79.2119, "eval_samples_per_second": 11.274, "eval_steps_per_second": 2.828, "step": 250 }, { "epoch": 0.12264150943396226, "grad_norm": 3.9357564449310303, "learning_rate": 0.00011652775287473745, "loss": 3.4362, "step": 260 }, { "epoch": 0.12735849056603774, "grad_norm": 1.8450816869735718, "learning_rate": 0.00010918189690211387, "loss": 1.6049, "step": 270 }, { "epoch": 0.1320754716981132, "grad_norm": 2.2753257751464844, "learning_rate": 0.00010181810309788618, "loss": 1.5727, "step": 280 }, { "epoch": 0.13679245283018868, "grad_norm": 0.006811914965510368, "learning_rate": 9.447224712526258e-05, "loss": 2.284, "step": 290 }, { "epoch": 0.14150943396226415, "grad_norm": 0.0010616736253723502, "learning_rate": 8.718011725613886e-05, "loss": 0.0001, "step": 300 }, { "epoch": 0.14150943396226415, "eval_loss": 1.952929973602295, "eval_runtime": 79.2711, "eval_samples_per_second": 11.265, "eval_steps_per_second": 2.826, "step": 300 }, { "epoch": 0.14622641509433962, "grad_norm": 5.2309889793396, "learning_rate": 7.997724001423507e-05, "loss": 2.5929, "step": 310 }, { "epoch": 0.1509433962264151, "grad_norm": 2.521411657333374, "learning_rate": 7.289870709344306e-05, "loss": 1.6089, "step": 320 }, { "epoch": 0.15566037735849056, "grad_norm": 2.273935317993164, "learning_rate": 6.597900439462128e-05, "loss": 1.4862, "step": 330 }, { "epoch": 0.16037735849056603, "grad_norm": 0.026630675420165062, "learning_rate": 5.9251844013752326e-05, "loss": 4.0695, "step": 340 }, { "epoch": 0.1650943396226415, "grad_norm": 7.486833055736497e-05, "learning_rate": 5.275000000000002e-05, "loss": 0.0, "step": 350 }, { "epoch": 0.1650943396226415, "eval_loss": 0.7211728096008301, "eval_runtime": 79.3058, "eval_samples_per_second": 11.26, "eval_steps_per_second": 2.825, "step": 350 }, { "epoch": 0.16981132075471697, "grad_norm": 3.576345682144165, "learning_rate": 4.650514868383623e-05, "loss": 1.9645, "step": 360 }, { "epoch": 0.17452830188679244, "grad_norm": 1.6262489557266235, "learning_rate": 4.054771435314305e-05, "loss": 1.7139, "step": 370 }, { "epoch": 0.1792452830188679, "grad_norm": 2.826154947280884, "learning_rate": 3.4906721029140495e-05, "loss": 1.6077, "step": 380 }, { "epoch": 0.18396226415094338, "grad_norm": 0.018351580947637558, "learning_rate": 2.9609651064272323e-05, "loss": 2.747, "step": 390 }, { "epoch": 0.18867924528301888, "grad_norm": 7.47749290894717e-05, "learning_rate": 2.468231125094783e-05, "loss": 0.0001, "step": 400 }, { "epoch": 0.18867924528301888, "eval_loss": 0.6076525449752808, "eval_runtime": 79.2739, "eval_samples_per_second": 11.265, "eval_steps_per_second": 2.826, "step": 400 }, { "epoch": 0.19339622641509435, "grad_norm": 2.413058280944824, "learning_rate": 2.0148707093443057e-05, "loss": 1.5055, "step": 410 }, { "epoch": 0.19811320754716982, "grad_norm": 2.6474435329437256, "learning_rate": 1.603092585549706e-05, "loss": 1.4478, "step": 420 }, { "epoch": 0.2028301886792453, "grad_norm": 2.5455524921417236, "learning_rate": 1.2349028953383204e-05, "loss": 1.5041, "step": 430 }, { "epoch": 0.20754716981132076, "grad_norm": 64.91968536376953, "learning_rate": 9.120954218705596e-06, "loss": 10.0888, "step": 440 }, { "epoch": 0.21226415094339623, "grad_norm": 0.0022940777707844973, "learning_rate": 6.362428507086673e-06, "loss": 0.0065, "step": 450 }, { "epoch": 0.21226415094339623, "eval_loss": 0.549039363861084, "eval_runtime": 79.325, "eval_samples_per_second": 11.257, "eval_steps_per_second": 2.824, "step": 450 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.30551191175168e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }