{ "best_metric": 0.6367893815040588, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.018298261665141813, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.659652333028362e-05, "eval_loss": 3.3232226371765137, "eval_runtime": 654.1201, "eval_samples_per_second": 17.59, "eval_steps_per_second": 4.398, "step": 1 }, { "epoch": 0.00036596523330283625, "grad_norm": 3.221409559249878, "learning_rate": 4.34e-05, "loss": 2.2773, "step": 10 }, { "epoch": 0.0007319304666056725, "grad_norm": 2.2033233642578125, "learning_rate": 8.68e-05, "loss": 1.6412, "step": 20 }, { "epoch": 0.0010978956999085087, "grad_norm": 2.0530292987823486, "learning_rate": 0.0001302, "loss": 1.2863, "step": 30 }, { "epoch": 0.001463860933211345, "grad_norm": 2.043330430984497, "learning_rate": 0.0001736, "loss": 1.1969, "step": 40 }, { "epoch": 0.0018298261665141812, "grad_norm": 5.248676300048828, "learning_rate": 0.000217, "loss": 1.1612, "step": 50 }, { "epoch": 0.0018298261665141812, "eval_loss": 1.7948706150054932, "eval_runtime": 654.0329, "eval_samples_per_second": 17.592, "eval_steps_per_second": 4.399, "step": 50 }, { "epoch": 0.0021957913998170175, "grad_norm": 1.6893078088760376, "learning_rate": 0.00021673569945319091, "loss": 1.2685, "step": 60 }, { "epoch": 0.0025617566331198535, "grad_norm": 1.2931175231933594, "learning_rate": 0.00021594408545846038, "loss": 1.0429, "step": 70 }, { "epoch": 0.00292772186642269, "grad_norm": 1.124497652053833, "learning_rate": 0.0002146290146796179, "loss": 1.0055, "step": 80 }, { "epoch": 0.003293687099725526, "grad_norm": 2.295246124267578, "learning_rate": 0.0002127968940093076, "loss": 1.0593, "step": 90 }, { "epoch": 0.0036596523330283625, "grad_norm": 2.5044991970062256, "learning_rate": 0.00021045664935527106, "loss": 1.0974, "step": 100 }, { "epoch": 0.0036596523330283625, "eval_loss": 1.6511973142623901, "eval_runtime": 653.846, "eval_samples_per_second": 17.597, "eval_steps_per_second": 4.4, "step": 100 }, { "epoch": 0.0040256175663311985, "grad_norm": 1.1868317127227783, "learning_rate": 0.00020761968215422217, "loss": 1.1934, "step": 110 }, { "epoch": 0.004391582799634035, "grad_norm": 1.1751034259796143, "learning_rate": 0.00020429981382519356, "loss": 0.8967, "step": 120 }, { "epoch": 0.004757548032936871, "grad_norm": 1.8134667873382568, "learning_rate": 0.00020051321843297219, "loss": 0.959, "step": 130 }, { "epoch": 0.005123513266239707, "grad_norm": 1.202273964881897, "learning_rate": 0.0001962783438896818, "loss": 1.0112, "step": 140 }, { "epoch": 0.0054894784995425435, "grad_norm": 2.1259782314300537, "learning_rate": 0.0001916158220784091, "loss": 0.8748, "step": 150 }, { "epoch": 0.0054894784995425435, "eval_loss": 1.6277738809585571, "eval_runtime": 654.0264, "eval_samples_per_second": 17.593, "eval_steps_per_second": 4.399, "step": 150 }, { "epoch": 0.00585544373284538, "grad_norm": 0.8180252313613892, "learning_rate": 0.00018654836833674362, "loss": 1.1652, "step": 160 }, { "epoch": 0.006221408966148216, "grad_norm": 1.229128360748291, "learning_rate": 0.0001811006707899361, "loss": 0.9004, "step": 170 }, { "epoch": 0.006587374199451052, "grad_norm": 2.126133680343628, "learning_rate": 0.0001752992700728339, "loss": 0.8799, "step": 180 }, { "epoch": 0.0069533394327538885, "grad_norm": 1.0177737474441528, "learning_rate": 0.00016917243002657602, "loss": 0.8698, "step": 190 }, { "epoch": 0.007319304666056725, "grad_norm": 2.6272943019866943, "learning_rate": 0.00016275, "loss": 0.8727, "step": 200 }, { "epoch": 0.007319304666056725, "eval_loss": 1.3153941631317139, "eval_runtime": 653.5309, "eval_samples_per_second": 17.606, "eval_steps_per_second": 4.402, "step": 200 }, { "epoch": 0.0076852698993595606, "grad_norm": 0.8264740109443665, "learning_rate": 0.0001560632694266149, "loss": 0.9661, "step": 210 }, { "epoch": 0.008051235132662397, "grad_norm": 0.7382209897041321, "learning_rate": 0.00014914481538562646, "loss": 0.8098, "step": 220 }, { "epoch": 0.008417200365965233, "grad_norm": 0.9931471943855286, "learning_rate": 0.0001420283438896818, "loss": 0.7825, "step": 230 }, { "epoch": 0.00878316559926807, "grad_norm": 0.7864680886268616, "learning_rate": 0.00013474852567256393, "loss": 0.8832, "step": 240 }, { "epoch": 0.009149130832570906, "grad_norm": 2.8572616577148438, "learning_rate": 0.00012734082727686196, "loss": 0.8983, "step": 250 }, { "epoch": 0.009149130832570906, "eval_loss": 1.1016149520874023, "eval_runtime": 654.6016, "eval_samples_per_second": 17.577, "eval_steps_per_second": 4.395, "step": 250 }, { "epoch": 0.009515096065873741, "grad_norm": 0.7957597970962524, "learning_rate": 0.0001198413382645404, "loss": 0.9094, "step": 260 }, { "epoch": 0.009881061299176578, "grad_norm": 0.9204810261726379, "learning_rate": 0.00011228659539222137, "loss": 0.8334, "step": 270 }, { "epoch": 0.010247026532479414, "grad_norm": 1.0104973316192627, "learning_rate": 0.00010471340460777866, "loss": 0.7847, "step": 280 }, { "epoch": 0.01061299176578225, "grad_norm": 0.8759304285049438, "learning_rate": 9.715866173545961e-05, "loss": 0.7903, "step": 290 }, { "epoch": 0.010978956999085087, "grad_norm": 3.211150646209717, "learning_rate": 8.965917272313806e-05, "loss": 0.7951, "step": 300 }, { "epoch": 0.010978956999085087, "eval_loss": 0.9889414310455322, "eval_runtime": 653.2449, "eval_samples_per_second": 17.614, "eval_steps_per_second": 4.404, "step": 300 }, { "epoch": 0.011344922232387923, "grad_norm": 0.8451036810874939, "learning_rate": 8.225147432743606e-05, "loss": 0.7332, "step": 310 }, { "epoch": 0.01171088746569076, "grad_norm": 0.6947381496429443, "learning_rate": 7.497165611031821e-05, "loss": 0.7797, "step": 320 }, { "epoch": 0.012076852698993596, "grad_norm": 0.7813195586204529, "learning_rate": 6.785518461437353e-05, "loss": 0.7427, "step": 330 }, { "epoch": 0.012442817932296431, "grad_norm": 1.1668095588684082, "learning_rate": 6.093673057338509e-05, "loss": 0.6847, "step": 340 }, { "epoch": 0.012808783165599268, "grad_norm": 1.675475001335144, "learning_rate": 5.4250000000000024e-05, "loss": 0.7452, "step": 350 }, { "epoch": 0.012808783165599268, "eval_loss": 0.799678385257721, "eval_runtime": 653.6373, "eval_samples_per_second": 17.603, "eval_steps_per_second": 4.402, "step": 350 }, { "epoch": 0.013174748398902104, "grad_norm": 0.8130257725715637, "learning_rate": 4.782756997342398e-05, "loss": 0.6847, "step": 360 }, { "epoch": 0.01354071363220494, "grad_norm": 0.8550499081611633, "learning_rate": 4.170072992716607e-05, "loss": 0.6897, "step": 370 }, { "epoch": 0.013906678865507777, "grad_norm": 0.7728462815284729, "learning_rate": 3.5899329210063916e-05, "loss": 0.7088, "step": 380 }, { "epoch": 0.014272644098810613, "grad_norm": 1.0523146390914917, "learning_rate": 3.045163166325637e-05, "loss": 0.6873, "step": 390 }, { "epoch": 0.01463860933211345, "grad_norm": 2.0533359050750732, "learning_rate": 2.5384177921590895e-05, "loss": 0.6468, "step": 400 }, { "epoch": 0.01463860933211345, "eval_loss": 0.7176544666290283, "eval_runtime": 654.5529, "eval_samples_per_second": 17.578, "eval_steps_per_second": 4.395, "step": 400 }, { "epoch": 0.015004574565416285, "grad_norm": 0.7370195984840393, "learning_rate": 2.0721656110318213e-05, "loss": 0.6352, "step": 410 }, { "epoch": 0.015370539798719121, "grad_norm": 0.7632651329040527, "learning_rate": 1.6486781567027783e-05, "loss": 0.6466, "step": 420 }, { "epoch": 0.01573650503202196, "grad_norm": 0.9370796084403992, "learning_rate": 1.2700186174806422e-05, "loss": 0.6818, "step": 430 }, { "epoch": 0.016102470265324794, "grad_norm": 0.8790647387504578, "learning_rate": 9.380317845777794e-06, "loss": 0.7192, "step": 440 }, { "epoch": 0.01646843549862763, "grad_norm": 2.4070212841033936, "learning_rate": 6.543350644728947e-06, "loss": 0.6286, "step": 450 }, { "epoch": 0.01646843549862763, "eval_loss": 0.6477181315422058, "eval_runtime": 654.5484, "eval_samples_per_second": 17.579, "eval_steps_per_second": 4.395, "step": 450 }, { "epoch": 0.016834400731930467, "grad_norm": 0.7986826300621033, "learning_rate": 4.2031059906924e-06, "loss": 0.5613, "step": 460 }, { "epoch": 0.0172003659652333, "grad_norm": 0.9289596080780029, "learning_rate": 2.3709853203820825e-06, "loss": 0.6961, "step": 470 }, { "epoch": 0.01756633119853614, "grad_norm": 0.9395463466644287, "learning_rate": 1.0559145415396157e-06, "loss": 0.6394, "step": 480 }, { "epoch": 0.017932296431838975, "grad_norm": 0.949378252029419, "learning_rate": 2.643005468090745e-07, "loss": 0.6338, "step": 490 }, { "epoch": 0.018298261665141813, "grad_norm": 1.6931450366973877, "learning_rate": 0.0, "loss": 0.6262, "step": 500 }, { "epoch": 0.018298261665141813, "eval_loss": 0.6367893815040588, "eval_runtime": 654.042, "eval_samples_per_second": 17.592, "eval_steps_per_second": 4.399, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.3877540397056e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }