{ "best_metric": 0.27326497435569763, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.17834849295523453, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003566969859104691, "eval_loss": 1.6572248935699463, "eval_runtime": 126.1616, "eval_samples_per_second": 9.361, "eval_steps_per_second": 2.346, "step": 1 }, { "epoch": 0.0035669698591046907, "grad_norm": 2.7096357345581055, "learning_rate": 4.34e-05, "loss": 0.9751, "step": 10 }, { "epoch": 0.0071339397182093815, "grad_norm": 10.129169464111328, "learning_rate": 8.68e-05, "loss": 0.6725, "step": 20 }, { "epoch": 0.010700909577314071, "grad_norm": 1.403724193572998, "learning_rate": 0.0001302, "loss": 0.4999, "step": 30 }, { "epoch": 0.014267879436418763, "grad_norm": 1.566288948059082, "learning_rate": 0.0001736, "loss": 0.5148, "step": 40 }, { "epoch": 0.017834849295523453, "grad_norm": 1.1698219776153564, "learning_rate": 0.000217, "loss": 0.3887, "step": 50 }, { "epoch": 0.017834849295523453, "eval_loss": 0.5776557326316833, "eval_runtime": 126.4119, "eval_samples_per_second": 9.342, "eval_steps_per_second": 2.342, "step": 50 }, { "epoch": 0.021401819154628143, "grad_norm": 1.3930236101150513, "learning_rate": 0.00021673569945319091, "loss": 0.5755, "step": 60 }, { "epoch": 0.024968789013732832, "grad_norm": 0.9935230612754822, "learning_rate": 0.00021594408545846038, "loss": 0.4614, "step": 70 }, { "epoch": 0.028535758872837526, "grad_norm": 1.562578558921814, "learning_rate": 0.0002146290146796179, "loss": 0.4925, "step": 80 }, { "epoch": 0.03210272873194221, "grad_norm": 1.2459532022476196, "learning_rate": 0.0002127968940093076, "loss": 0.3117, "step": 90 }, { "epoch": 0.035669698591046906, "grad_norm": 1.4213972091674805, "learning_rate": 0.00021045664935527106, "loss": 0.4069, "step": 100 }, { "epoch": 0.035669698591046906, "eval_loss": 0.6545668840408325, "eval_runtime": 126.3911, "eval_samples_per_second": 9.344, "eval_steps_per_second": 2.342, "step": 100 }, { "epoch": 0.0392366684501516, "grad_norm": 1.6053053140640259, "learning_rate": 0.00020761968215422217, "loss": 0.6128, "step": 110 }, { "epoch": 0.042803638309256285, "grad_norm": 1.4004822969436646, "learning_rate": 0.00020429981382519356, "loss": 0.5975, "step": 120 }, { "epoch": 0.04637060816836098, "grad_norm": 1.241351842880249, "learning_rate": 0.00020051321843297219, "loss": 0.4978, "step": 130 }, { "epoch": 0.049937578027465665, "grad_norm": 5.282879829406738, "learning_rate": 0.0001962783438896818, "loss": 0.4208, "step": 140 }, { "epoch": 0.05350454788657036, "grad_norm": 2.913874864578247, "learning_rate": 0.0001916158220784091, "loss": 0.3688, "step": 150 }, { "epoch": 0.05350454788657036, "eval_loss": 0.5131043791770935, "eval_runtime": 126.3713, "eval_samples_per_second": 9.345, "eval_steps_per_second": 2.342, "step": 150 }, { "epoch": 0.05707151774567505, "grad_norm": 1.2657321691513062, "learning_rate": 0.00018654836833674362, "loss": 0.6035, "step": 160 }, { "epoch": 0.06063848760477974, "grad_norm": 1.2073376178741455, "learning_rate": 0.0001811006707899361, "loss": 0.3707, "step": 170 }, { "epoch": 0.06420545746388442, "grad_norm": 1.2466622591018677, "learning_rate": 0.0001752992700728339, "loss": 0.4732, "step": 180 }, { "epoch": 0.06777242732298912, "grad_norm": 1.2483686208724976, "learning_rate": 0.00016917243002657602, "loss": 0.3561, "step": 190 }, { "epoch": 0.07133939718209381, "grad_norm": 1.5653057098388672, "learning_rate": 0.00016275, "loss": 0.4015, "step": 200 }, { "epoch": 0.07133939718209381, "eval_loss": 0.5132488012313843, "eval_runtime": 126.2518, "eval_samples_per_second": 9.354, "eval_steps_per_second": 2.345, "step": 200 }, { "epoch": 0.0749063670411985, "grad_norm": 1.5472954511642456, "learning_rate": 0.0001560632694266149, "loss": 0.5455, "step": 210 }, { "epoch": 0.0784733369003032, "grad_norm": 1.3301926851272583, "learning_rate": 0.00014914481538562646, "loss": 0.4256, "step": 220 }, { "epoch": 0.08204030675940788, "grad_norm": 1.182495355606079, "learning_rate": 0.0001420283438896818, "loss": 0.3823, "step": 230 }, { "epoch": 0.08560727661851257, "grad_norm": 1.007603645324707, "learning_rate": 0.00013474852567256393, "loss": 0.275, "step": 240 }, { "epoch": 0.08917424647761726, "grad_norm": 1.2171993255615234, "learning_rate": 0.00012734082727686196, "loss": 0.2953, "step": 250 }, { "epoch": 0.08917424647761726, "eval_loss": 0.47643429040908813, "eval_runtime": 126.4327, "eval_samples_per_second": 9.341, "eval_steps_per_second": 2.341, "step": 250 }, { "epoch": 0.09274121633672196, "grad_norm": 1.6132450103759766, "learning_rate": 0.0001198413382645404, "loss": 0.5383, "step": 260 }, { "epoch": 0.09630818619582665, "grad_norm": 1.4011763334274292, "learning_rate": 0.00011228659539222137, "loss": 0.3738, "step": 270 }, { "epoch": 0.09987515605493133, "grad_norm": 1.091366171836853, "learning_rate": 0.00010471340460777866, "loss": 0.3487, "step": 280 }, { "epoch": 0.10344212591403602, "grad_norm": 1.5295441150665283, "learning_rate": 9.715866173545961e-05, "loss": 0.3104, "step": 290 }, { "epoch": 0.10700909577314072, "grad_norm": 0.5504244565963745, "learning_rate": 8.965917272313806e-05, "loss": 0.2395, "step": 300 }, { "epoch": 0.10700909577314072, "eval_loss": 0.3887171447277069, "eval_runtime": 126.4389, "eval_samples_per_second": 9.34, "eval_steps_per_second": 2.341, "step": 300 }, { "epoch": 0.11057606563224541, "grad_norm": 1.032090425491333, "learning_rate": 8.225147432743606e-05, "loss": 0.5718, "step": 310 }, { "epoch": 0.1141430354913501, "grad_norm": 0.9163453578948975, "learning_rate": 7.497165611031821e-05, "loss": 0.3106, "step": 320 }, { "epoch": 0.11771000535045478, "grad_norm": 0.7260503172874451, "learning_rate": 6.785518461437353e-05, "loss": 0.3202, "step": 330 }, { "epoch": 0.12127697520955948, "grad_norm": 0.6277322173118591, "learning_rate": 6.093673057338509e-05, "loss": 0.2855, "step": 340 }, { "epoch": 0.12484394506866417, "grad_norm": 1.2401589155197144, "learning_rate": 5.4250000000000024e-05, "loss": 0.2984, "step": 350 }, { "epoch": 0.12484394506866417, "eval_loss": 0.35979732871055603, "eval_runtime": 126.6911, "eval_samples_per_second": 9.322, "eval_steps_per_second": 2.336, "step": 350 }, { "epoch": 0.12841091492776885, "grad_norm": 1.1920021772384644, "learning_rate": 4.782756997342398e-05, "loss": 0.4701, "step": 360 }, { "epoch": 0.13197788478687356, "grad_norm": 1.140146255493164, "learning_rate": 4.170072992716607e-05, "loss": 0.3846, "step": 370 }, { "epoch": 0.13554485464597824, "grad_norm": 0.6016573309898376, "learning_rate": 3.5899329210063916e-05, "loss": 0.2747, "step": 380 }, { "epoch": 0.13911182450508294, "grad_norm": 0.7130181193351746, "learning_rate": 3.045163166325637e-05, "loss": 0.2489, "step": 390 }, { "epoch": 0.14267879436418762, "grad_norm": 0.7759641408920288, "learning_rate": 2.5384177921590895e-05, "loss": 0.1899, "step": 400 }, { "epoch": 0.14267879436418762, "eval_loss": 0.3352268934249878, "eval_runtime": 126.4127, "eval_samples_per_second": 9.342, "eval_steps_per_second": 2.342, "step": 400 }, { "epoch": 0.1462457642232923, "grad_norm": 1.3363033533096313, "learning_rate": 2.0721656110318213e-05, "loss": 0.5647, "step": 410 }, { "epoch": 0.149812734082397, "grad_norm": 0.7996323704719543, "learning_rate": 1.6486781567027783e-05, "loss": 0.276, "step": 420 }, { "epoch": 0.1533797039415017, "grad_norm": 0.8417302966117859, "learning_rate": 1.2700186174806422e-05, "loss": 0.3509, "step": 430 }, { "epoch": 0.1569466738006064, "grad_norm": 0.6128970384597778, "learning_rate": 9.380317845777794e-06, "loss": 0.2575, "step": 440 }, { "epoch": 0.16051364365971107, "grad_norm": 0.5301075577735901, "learning_rate": 6.543350644728947e-06, "loss": 0.1957, "step": 450 }, { "epoch": 0.16051364365971107, "eval_loss": 0.27697739005088806, "eval_runtime": 126.4761, "eval_samples_per_second": 9.338, "eval_steps_per_second": 2.34, "step": 450 }, { "epoch": 0.16408061351881575, "grad_norm": 0.767898678779602, "learning_rate": 4.2031059906924e-06, "loss": 0.4768, "step": 460 }, { "epoch": 0.16764758337792046, "grad_norm": 0.8965118527412415, "learning_rate": 2.3709853203820825e-06, "loss": 0.3135, "step": 470 }, { "epoch": 0.17121455323702514, "grad_norm": 0.7519134283065796, "learning_rate": 1.0559145415396157e-06, "loss": 0.2856, "step": 480 }, { "epoch": 0.17478152309612985, "grad_norm": 0.5825276374816895, "learning_rate": 2.643005468090745e-07, "loss": 0.2907, "step": 490 }, { "epoch": 0.17834849295523453, "grad_norm": 0.5771666169166565, "learning_rate": 0.0, "loss": 0.1991, "step": 500 }, { "epoch": 0.17834849295523453, "eval_loss": 0.27326497435569763, "eval_runtime": 126.8529, "eval_samples_per_second": 9.31, "eval_steps_per_second": 2.333, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7989024426544333e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }