{ "best_metric": 2.0348799228668213, "best_model_checkpoint": "miner_id_24/checkpoint-600", "epoch": 0.058783188008229646, "eval_steps": 150, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.797198001371608e-05, "eval_loss": 2.3550796508789062, "eval_runtime": 31.3445, "eval_samples_per_second": 68.561, "eval_steps_per_second": 17.164, "step": 1 }, { "epoch": 0.0009797198001371607, "grad_norm": 2.6627249717712402, "learning_rate": 7.333333333333333e-06, "loss": 2.2873, "step": 10 }, { "epoch": 0.0019594396002743215, "grad_norm": 3.744236707687378, "learning_rate": 1.4666666666666666e-05, "loss": 2.049, "step": 20 }, { "epoch": 0.0029391594004114824, "grad_norm": 1.9671672582626343, "learning_rate": 2.2e-05, "loss": 2.5565, "step": 30 }, { "epoch": 0.003918879200548643, "grad_norm": 3.646611452102661, "learning_rate": 2.9333333333333333e-05, "loss": 2.2786, "step": 40 }, { "epoch": 0.004898599000685804, "grad_norm": 3.827122211456299, "learning_rate": 3.6666666666666666e-05, "loss": 2.2983, "step": 50 }, { "epoch": 0.005878318800822965, "grad_norm": 3.6867024898529053, "learning_rate": 4.4e-05, "loss": 2.1209, "step": 60 }, { "epoch": 0.006858038600960126, "grad_norm": 2.6899774074554443, "learning_rate": 4.39627794819679e-05, "loss": 2.1654, "step": 70 }, { "epoch": 0.007837758401097286, "grad_norm": 1.979284644126892, "learning_rate": 4.3851243870322744e-05, "loss": 2.2673, "step": 80 }, { "epoch": 0.008817478201234448, "grad_norm": 2.812793016433716, "learning_rate": 4.366577056626858e-05, "loss": 2.1135, "step": 90 }, { "epoch": 0.009797198001371608, "grad_norm": 1.854182481765747, "learning_rate": 4.340698715275612e-05, "loss": 1.9896, "step": 100 }, { "epoch": 0.010776917801508768, "grad_norm": 3.9260337352752686, "learning_rate": 4.3075769270940754e-05, "loss": 2.307, "step": 110 }, { "epoch": 0.01175663760164593, "grad_norm": 2.632535219192505, "learning_rate": 4.267323765728998e-05, "loss": 2.2054, "step": 120 }, { "epoch": 0.01273635740178309, "grad_norm": 2.7458906173706055, "learning_rate": 4.220075435136603e-05, "loss": 2.1946, "step": 130 }, { "epoch": 0.013716077201920251, "grad_norm": 3.668741464614868, "learning_rate": 4.165991808711507e-05, "loss": 2.2404, "step": 140 }, { "epoch": 0.014695797002057412, "grad_norm": 2.9794223308563232, "learning_rate": 4.105255888325765e-05, "loss": 1.8902, "step": 150 }, { "epoch": 0.014695797002057412, "eval_loss": 2.0861992835998535, "eval_runtime": 30.7572, "eval_samples_per_second": 69.87, "eval_steps_per_second": 17.492, "step": 150 }, { "epoch": 0.01567551680219457, "grad_norm": 2.3339385986328125, "learning_rate": 4.03807318510846e-05, "loss": 2.1139, "step": 160 }, { "epoch": 0.016655236602331733, "grad_norm": 3.324424982070923, "learning_rate": 3.9646710240610966e-05, "loss": 2.0497, "step": 170 }, { "epoch": 0.017634956402468895, "grad_norm": 5.234289169311523, "learning_rate": 3.885297774861751e-05, "loss": 2.1816, "step": 180 }, { "epoch": 0.018614676202606054, "grad_norm": 3.0698719024658203, "learning_rate": 3.800222011460707e-05, "loss": 1.6112, "step": 190 }, { "epoch": 0.019594396002743215, "grad_norm": 1.8660218715667725, "learning_rate": 3.709731603311214e-05, "loss": 1.8906, "step": 200 }, { "epoch": 0.020574115802880377, "grad_norm": 2.6674013137817383, "learning_rate": 3.614132741310386e-05, "loss": 1.8914, "step": 210 }, { "epoch": 0.021553835603017536, "grad_norm": 1.9608094692230225, "learning_rate": 3.5137489017461296e-05, "loss": 1.7041, "step": 220 }, { "epoch": 0.022533555403154697, "grad_norm": 2.6172726154327393, "learning_rate": 3.4089197517557735e-05, "loss": 1.9938, "step": 230 }, { "epoch": 0.02351327520329186, "grad_norm": 1.9283496141433716, "learning_rate": 3.3e-05, "loss": 1.9221, "step": 240 }, { "epoch": 0.02449299500342902, "grad_norm": 3.148313045501709, "learning_rate": 3.187358196441017e-05, "loss": 1.7092, "step": 250 }, { "epoch": 0.02547271480356618, "grad_norm": 4.097523212432861, "learning_rate": 3.071375485286145e-05, "loss": 2.0906, "step": 260 }, { "epoch": 0.02645243460370334, "grad_norm": 3.45194673538208, "learning_rate": 2.9524443153164715e-05, "loss": 2.0168, "step": 270 }, { "epoch": 0.027432154403840503, "grad_norm": 3.444427967071533, "learning_rate": 2.8309671119643985e-05, "loss": 2.0562, "step": 280 }, { "epoch": 0.02841187420397766, "grad_norm": 3.895585298538208, "learning_rate": 2.7073549156333684e-05, "loss": 2.0465, "step": 290 }, { "epoch": 0.029391594004114823, "grad_norm": 3.246363639831543, "learning_rate": 2.5820259908672472e-05, "loss": 1.8482, "step": 300 }, { "epoch": 0.029391594004114823, "eval_loss": 2.049943685531616, "eval_runtime": 30.8943, "eval_samples_per_second": 69.56, "eval_steps_per_second": 17.414, "step": 300 }, { "epoch": 0.030371313804251985, "grad_norm": 3.1775381565093994, "learning_rate": 2.4554044110755066e-05, "loss": 2.4515, "step": 310 }, { "epoch": 0.03135103360438914, "grad_norm": 2.6267483234405518, "learning_rate": 2.3279186236030468e-05, "loss": 2.0211, "step": 320 }, { "epoch": 0.032330753404526305, "grad_norm": 2.6191763877868652, "learning_rate": 2.2e-05, "loss": 2.3442, "step": 330 }, { "epoch": 0.03331047320466347, "grad_norm": 2.536149024963379, "learning_rate": 2.072081376396953e-05, "loss": 1.8382, "step": 340 }, { "epoch": 0.03429019300480063, "grad_norm": 2.8444716930389404, "learning_rate": 1.9445955889244933e-05, "loss": 2.0293, "step": 350 }, { "epoch": 0.03526991280493779, "grad_norm": 2.9862940311431885, "learning_rate": 1.8179740091327534e-05, "loss": 2.0392, "step": 360 }, { "epoch": 0.036249632605074945, "grad_norm": 4.285247802734375, "learning_rate": 1.6926450843666314e-05, "loss": 2.0266, "step": 370 }, { "epoch": 0.03722935240521211, "grad_norm": 3.9121134281158447, "learning_rate": 1.569032888035602e-05, "loss": 2.1918, "step": 380 }, { "epoch": 0.03820907220534927, "grad_norm": 3.740175485610962, "learning_rate": 1.447555684683529e-05, "loss": 2.0974, "step": 390 }, { "epoch": 0.03918879200548643, "grad_norm": 2.3882009983062744, "learning_rate": 1.3286245147138549e-05, "loss": 2.1681, "step": 400 }, { "epoch": 0.04016851180562359, "grad_norm": 2.8183250427246094, "learning_rate": 1.2126418035589831e-05, "loss": 2.127, "step": 410 }, { "epoch": 0.041148231605760754, "grad_norm": 2.2813663482666016, "learning_rate": 1.1000000000000005e-05, "loss": 1.8708, "step": 420 }, { "epoch": 0.042127951405897916, "grad_norm": 3.027383804321289, "learning_rate": 9.910802482442268e-06, "loss": 1.9334, "step": 430 }, { "epoch": 0.04310767120603507, "grad_norm": 3.16849946975708, "learning_rate": 8.86251098253871e-06, "loss": 1.9682, "step": 440 }, { "epoch": 0.04408739100617223, "grad_norm": 2.1472132205963135, "learning_rate": 7.858672586896134e-06, "loss": 2.0641, "step": 450 }, { "epoch": 0.04408739100617223, "eval_loss": 2.037466526031494, "eval_runtime": 30.4503, "eval_samples_per_second": 70.574, "eval_steps_per_second": 17.668, "step": 450 }, { "epoch": 0.045067110806309395, "grad_norm": 3.0501604080200195, "learning_rate": 6.902683966887863e-06, "loss": 1.9057, "step": 460 }, { "epoch": 0.04604683060644656, "grad_norm": 2.0422005653381348, "learning_rate": 5.997779885392928e-06, "loss": 1.8502, "step": 470 }, { "epoch": 0.04702655040658372, "grad_norm": 2.0223166942596436, "learning_rate": 5.147022251382486e-06, "loss": 2.1528, "step": 480 }, { "epoch": 0.04800627020672088, "grad_norm": 2.5460333824157715, "learning_rate": 4.3532897593890356e-06, "loss": 2.062, "step": 490 }, { "epoch": 0.04898599000685804, "grad_norm": 2.885043144226074, "learning_rate": 3.619268148915402e-06, "loss": 2.0503, "step": 500 }, { "epoch": 0.0499657098069952, "grad_norm": 2.628741979598999, "learning_rate": 2.947441116742348e-06, "loss": 1.9733, "step": 510 }, { "epoch": 0.05094542960713236, "grad_norm": 3.8591113090515137, "learning_rate": 2.3400819128849325e-06, "loss": 2.128, "step": 520 }, { "epoch": 0.05192514940726952, "grad_norm": 3.2337727546691895, "learning_rate": 1.7992456486339744e-06, "loss": 2.0159, "step": 530 }, { "epoch": 0.05290486920740668, "grad_norm": 3.3239448070526123, "learning_rate": 1.326762342710017e-06, "loss": 2.2092, "step": 540 }, { "epoch": 0.053884589007543844, "grad_norm": 1.9489779472351074, "learning_rate": 9.242307290592442e-07, "loss": 1.8123, "step": 550 }, { "epoch": 0.054864308807681006, "grad_norm": 3.490032434463501, "learning_rate": 5.930128472438762e-07, "loss": 2.0926, "step": 560 }, { "epoch": 0.05584402860781816, "grad_norm": 3.1768524646759033, "learning_rate": 3.3422943373142354e-07, "loss": 2.1623, "step": 570 }, { "epoch": 0.05682374840795532, "grad_norm": 2.5022518634796143, "learning_rate": 1.4875612967725348e-07, "loss": 2.075, "step": 580 }, { "epoch": 0.057803468208092484, "grad_norm": 3.5616374015808105, "learning_rate": 3.722051803210014e-08, "loss": 1.8666, "step": 590 }, { "epoch": 0.058783188008229646, "grad_norm": 3.008579969406128, "learning_rate": 0.0, "loss": 2.0734, "step": 600 }, { "epoch": 0.058783188008229646, "eval_loss": 2.0348799228668213, "eval_runtime": 30.5609, "eval_samples_per_second": 70.319, "eval_steps_per_second": 17.604, "step": 600 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3118127351070720.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }