|
{ |
|
"best_metric": 0.27326497435569763, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.17834849295523453, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0003566969859104691, |
|
"eval_loss": 1.6572248935699463, |
|
"eval_runtime": 126.1616, |
|
"eval_samples_per_second": 9.361, |
|
"eval_steps_per_second": 2.346, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0035669698591046907, |
|
"grad_norm": 2.7096357345581055, |
|
"learning_rate": 4.34e-05, |
|
"loss": 0.9751, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0071339397182093815, |
|
"grad_norm": 10.129169464111328, |
|
"learning_rate": 8.68e-05, |
|
"loss": 0.6725, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.010700909577314071, |
|
"grad_norm": 1.403724193572998, |
|
"learning_rate": 0.0001302, |
|
"loss": 0.4999, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.014267879436418763, |
|
"grad_norm": 1.566288948059082, |
|
"learning_rate": 0.0001736, |
|
"loss": 0.5148, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.017834849295523453, |
|
"grad_norm": 1.1698219776153564, |
|
"learning_rate": 0.000217, |
|
"loss": 0.3887, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.017834849295523453, |
|
"eval_loss": 0.5776557326316833, |
|
"eval_runtime": 126.4119, |
|
"eval_samples_per_second": 9.342, |
|
"eval_steps_per_second": 2.342, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.021401819154628143, |
|
"grad_norm": 1.3930236101150513, |
|
"learning_rate": 0.00021673569945319091, |
|
"loss": 0.5755, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.024968789013732832, |
|
"grad_norm": 0.9935230612754822, |
|
"learning_rate": 0.00021594408545846038, |
|
"loss": 0.4614, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.028535758872837526, |
|
"grad_norm": 1.562578558921814, |
|
"learning_rate": 0.0002146290146796179, |
|
"loss": 0.4925, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03210272873194221, |
|
"grad_norm": 1.2459532022476196, |
|
"learning_rate": 0.0002127968940093076, |
|
"loss": 0.3117, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.035669698591046906, |
|
"grad_norm": 1.4213972091674805, |
|
"learning_rate": 0.00021045664935527106, |
|
"loss": 0.4069, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.035669698591046906, |
|
"eval_loss": 0.6545668840408325, |
|
"eval_runtime": 126.3911, |
|
"eval_samples_per_second": 9.344, |
|
"eval_steps_per_second": 2.342, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0392366684501516, |
|
"grad_norm": 1.6053053140640259, |
|
"learning_rate": 0.00020761968215422217, |
|
"loss": 0.6128, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.042803638309256285, |
|
"grad_norm": 1.4004822969436646, |
|
"learning_rate": 0.00020429981382519356, |
|
"loss": 0.5975, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.04637060816836098, |
|
"grad_norm": 1.241351842880249, |
|
"learning_rate": 0.00020051321843297219, |
|
"loss": 0.4978, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.049937578027465665, |
|
"grad_norm": 5.282879829406738, |
|
"learning_rate": 0.0001962783438896818, |
|
"loss": 0.4208, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05350454788657036, |
|
"grad_norm": 2.913874864578247, |
|
"learning_rate": 0.0001916158220784091, |
|
"loss": 0.3688, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05350454788657036, |
|
"eval_loss": 0.5131043791770935, |
|
"eval_runtime": 126.3713, |
|
"eval_samples_per_second": 9.345, |
|
"eval_steps_per_second": 2.342, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05707151774567505, |
|
"grad_norm": 1.2657321691513062, |
|
"learning_rate": 0.00018654836833674362, |
|
"loss": 0.6035, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06063848760477974, |
|
"grad_norm": 1.2073376178741455, |
|
"learning_rate": 0.0001811006707899361, |
|
"loss": 0.3707, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.06420545746388442, |
|
"grad_norm": 1.2466622591018677, |
|
"learning_rate": 0.0001752992700728339, |
|
"loss": 0.4732, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06777242732298912, |
|
"grad_norm": 1.2483686208724976, |
|
"learning_rate": 0.00016917243002657602, |
|
"loss": 0.3561, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07133939718209381, |
|
"grad_norm": 1.5653057098388672, |
|
"learning_rate": 0.00016275, |
|
"loss": 0.4015, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07133939718209381, |
|
"eval_loss": 0.5132488012313843, |
|
"eval_runtime": 126.2518, |
|
"eval_samples_per_second": 9.354, |
|
"eval_steps_per_second": 2.345, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0749063670411985, |
|
"grad_norm": 1.5472954511642456, |
|
"learning_rate": 0.0001560632694266149, |
|
"loss": 0.5455, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0784733369003032, |
|
"grad_norm": 1.3301926851272583, |
|
"learning_rate": 0.00014914481538562646, |
|
"loss": 0.4256, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.08204030675940788, |
|
"grad_norm": 1.182495355606079, |
|
"learning_rate": 0.0001420283438896818, |
|
"loss": 0.3823, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.08560727661851257, |
|
"grad_norm": 1.007603645324707, |
|
"learning_rate": 0.00013474852567256393, |
|
"loss": 0.275, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.08917424647761726, |
|
"grad_norm": 1.2171993255615234, |
|
"learning_rate": 0.00012734082727686196, |
|
"loss": 0.2953, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08917424647761726, |
|
"eval_loss": 0.47643429040908813, |
|
"eval_runtime": 126.4327, |
|
"eval_samples_per_second": 9.341, |
|
"eval_steps_per_second": 2.341, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.09274121633672196, |
|
"grad_norm": 1.6132450103759766, |
|
"learning_rate": 0.0001198413382645404, |
|
"loss": 0.5383, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09630818619582665, |
|
"grad_norm": 1.4011763334274292, |
|
"learning_rate": 0.00011228659539222137, |
|
"loss": 0.3738, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.09987515605493133, |
|
"grad_norm": 1.091366171836853, |
|
"learning_rate": 0.00010471340460777866, |
|
"loss": 0.3487, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.10344212591403602, |
|
"grad_norm": 1.5295441150665283, |
|
"learning_rate": 9.715866173545961e-05, |
|
"loss": 0.3104, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.10700909577314072, |
|
"grad_norm": 0.5504244565963745, |
|
"learning_rate": 8.965917272313806e-05, |
|
"loss": 0.2395, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.10700909577314072, |
|
"eval_loss": 0.3887171447277069, |
|
"eval_runtime": 126.4389, |
|
"eval_samples_per_second": 9.34, |
|
"eval_steps_per_second": 2.341, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11057606563224541, |
|
"grad_norm": 1.032090425491333, |
|
"learning_rate": 8.225147432743606e-05, |
|
"loss": 0.5718, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1141430354913501, |
|
"grad_norm": 0.9163453578948975, |
|
"learning_rate": 7.497165611031821e-05, |
|
"loss": 0.3106, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.11771000535045478, |
|
"grad_norm": 0.7260503172874451, |
|
"learning_rate": 6.785518461437353e-05, |
|
"loss": 0.3202, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.12127697520955948, |
|
"grad_norm": 0.6277322173118591, |
|
"learning_rate": 6.093673057338509e-05, |
|
"loss": 0.2855, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.12484394506866417, |
|
"grad_norm": 1.2401589155197144, |
|
"learning_rate": 5.4250000000000024e-05, |
|
"loss": 0.2984, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.12484394506866417, |
|
"eval_loss": 0.35979732871055603, |
|
"eval_runtime": 126.6911, |
|
"eval_samples_per_second": 9.322, |
|
"eval_steps_per_second": 2.336, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.12841091492776885, |
|
"grad_norm": 1.1920021772384644, |
|
"learning_rate": 4.782756997342398e-05, |
|
"loss": 0.4701, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.13197788478687356, |
|
"grad_norm": 1.140146255493164, |
|
"learning_rate": 4.170072992716607e-05, |
|
"loss": 0.3846, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.13554485464597824, |
|
"grad_norm": 0.6016573309898376, |
|
"learning_rate": 3.5899329210063916e-05, |
|
"loss": 0.2747, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.13911182450508294, |
|
"grad_norm": 0.7130181193351746, |
|
"learning_rate": 3.045163166325637e-05, |
|
"loss": 0.2489, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.14267879436418762, |
|
"grad_norm": 0.7759641408920288, |
|
"learning_rate": 2.5384177921590895e-05, |
|
"loss": 0.1899, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14267879436418762, |
|
"eval_loss": 0.3352268934249878, |
|
"eval_runtime": 126.4127, |
|
"eval_samples_per_second": 9.342, |
|
"eval_steps_per_second": 2.342, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1462457642232923, |
|
"grad_norm": 1.3363033533096313, |
|
"learning_rate": 2.0721656110318213e-05, |
|
"loss": 0.5647, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.149812734082397, |
|
"grad_norm": 0.7996323704719543, |
|
"learning_rate": 1.6486781567027783e-05, |
|
"loss": 0.276, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1533797039415017, |
|
"grad_norm": 0.8417302966117859, |
|
"learning_rate": 1.2700186174806422e-05, |
|
"loss": 0.3509, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.1569466738006064, |
|
"grad_norm": 0.6128970384597778, |
|
"learning_rate": 9.380317845777794e-06, |
|
"loss": 0.2575, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.16051364365971107, |
|
"grad_norm": 0.5301075577735901, |
|
"learning_rate": 6.543350644728947e-06, |
|
"loss": 0.1957, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16051364365971107, |
|
"eval_loss": 0.27697739005088806, |
|
"eval_runtime": 126.4761, |
|
"eval_samples_per_second": 9.338, |
|
"eval_steps_per_second": 2.34, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16408061351881575, |
|
"grad_norm": 0.767898678779602, |
|
"learning_rate": 4.2031059906924e-06, |
|
"loss": 0.4768, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.16764758337792046, |
|
"grad_norm": 0.8965118527412415, |
|
"learning_rate": 2.3709853203820825e-06, |
|
"loss": 0.3135, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.17121455323702514, |
|
"grad_norm": 0.7519134283065796, |
|
"learning_rate": 1.0559145415396157e-06, |
|
"loss": 0.2856, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.17478152309612985, |
|
"grad_norm": 0.5825276374816895, |
|
"learning_rate": 2.643005468090745e-07, |
|
"loss": 0.2907, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.17834849295523453, |
|
"grad_norm": 0.5771666169166565, |
|
"learning_rate": 0.0, |
|
"loss": 0.1991, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.17834849295523453, |
|
"eval_loss": 0.27326497435569763, |
|
"eval_runtime": 126.8529, |
|
"eval_samples_per_second": 9.31, |
|
"eval_steps_per_second": 2.333, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7989024426544333e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|