|
{ |
|
"best_metric": 0.6367893815040588, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.018298261665141813, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.659652333028362e-05, |
|
"eval_loss": 3.3232226371765137, |
|
"eval_runtime": 654.1201, |
|
"eval_samples_per_second": 17.59, |
|
"eval_steps_per_second": 4.398, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00036596523330283625, |
|
"grad_norm": 3.221409559249878, |
|
"learning_rate": 4.34e-05, |
|
"loss": 2.2773, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0007319304666056725, |
|
"grad_norm": 2.2033233642578125, |
|
"learning_rate": 8.68e-05, |
|
"loss": 1.6412, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0010978956999085087, |
|
"grad_norm": 2.0530292987823486, |
|
"learning_rate": 0.0001302, |
|
"loss": 1.2863, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.001463860933211345, |
|
"grad_norm": 2.043330430984497, |
|
"learning_rate": 0.0001736, |
|
"loss": 1.1969, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0018298261665141812, |
|
"grad_norm": 5.248676300048828, |
|
"learning_rate": 0.000217, |
|
"loss": 1.1612, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0018298261665141812, |
|
"eval_loss": 1.7948706150054932, |
|
"eval_runtime": 654.0329, |
|
"eval_samples_per_second": 17.592, |
|
"eval_steps_per_second": 4.399, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0021957913998170175, |
|
"grad_norm": 1.6893078088760376, |
|
"learning_rate": 0.00021673569945319091, |
|
"loss": 1.2685, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0025617566331198535, |
|
"grad_norm": 1.2931175231933594, |
|
"learning_rate": 0.00021594408545846038, |
|
"loss": 1.0429, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.00292772186642269, |
|
"grad_norm": 1.124497652053833, |
|
"learning_rate": 0.0002146290146796179, |
|
"loss": 1.0055, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.003293687099725526, |
|
"grad_norm": 2.295246124267578, |
|
"learning_rate": 0.0002127968940093076, |
|
"loss": 1.0593, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0036596523330283625, |
|
"grad_norm": 2.5044991970062256, |
|
"learning_rate": 0.00021045664935527106, |
|
"loss": 1.0974, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0036596523330283625, |
|
"eval_loss": 1.6511973142623901, |
|
"eval_runtime": 653.846, |
|
"eval_samples_per_second": 17.597, |
|
"eval_steps_per_second": 4.4, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0040256175663311985, |
|
"grad_norm": 1.1868317127227783, |
|
"learning_rate": 0.00020761968215422217, |
|
"loss": 1.1934, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.004391582799634035, |
|
"grad_norm": 1.1751034259796143, |
|
"learning_rate": 0.00020429981382519356, |
|
"loss": 0.8967, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.004757548032936871, |
|
"grad_norm": 1.8134667873382568, |
|
"learning_rate": 0.00020051321843297219, |
|
"loss": 0.959, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.005123513266239707, |
|
"grad_norm": 1.202273964881897, |
|
"learning_rate": 0.0001962783438896818, |
|
"loss": 1.0112, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0054894784995425435, |
|
"grad_norm": 2.1259782314300537, |
|
"learning_rate": 0.0001916158220784091, |
|
"loss": 0.8748, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0054894784995425435, |
|
"eval_loss": 1.6277738809585571, |
|
"eval_runtime": 654.0264, |
|
"eval_samples_per_second": 17.593, |
|
"eval_steps_per_second": 4.399, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.00585544373284538, |
|
"grad_norm": 0.8180252313613892, |
|
"learning_rate": 0.00018654836833674362, |
|
"loss": 1.1652, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.006221408966148216, |
|
"grad_norm": 1.229128360748291, |
|
"learning_rate": 0.0001811006707899361, |
|
"loss": 0.9004, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.006587374199451052, |
|
"grad_norm": 2.126133680343628, |
|
"learning_rate": 0.0001752992700728339, |
|
"loss": 0.8799, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0069533394327538885, |
|
"grad_norm": 1.0177737474441528, |
|
"learning_rate": 0.00016917243002657602, |
|
"loss": 0.8698, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.007319304666056725, |
|
"grad_norm": 2.6272943019866943, |
|
"learning_rate": 0.00016275, |
|
"loss": 0.8727, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.007319304666056725, |
|
"eval_loss": 1.3153941631317139, |
|
"eval_runtime": 653.5309, |
|
"eval_samples_per_second": 17.606, |
|
"eval_steps_per_second": 4.402, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0076852698993595606, |
|
"grad_norm": 0.8264740109443665, |
|
"learning_rate": 0.0001560632694266149, |
|
"loss": 0.9661, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.008051235132662397, |
|
"grad_norm": 0.7382209897041321, |
|
"learning_rate": 0.00014914481538562646, |
|
"loss": 0.8098, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.008417200365965233, |
|
"grad_norm": 0.9931471943855286, |
|
"learning_rate": 0.0001420283438896818, |
|
"loss": 0.7825, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.00878316559926807, |
|
"grad_norm": 0.7864680886268616, |
|
"learning_rate": 0.00013474852567256393, |
|
"loss": 0.8832, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.009149130832570906, |
|
"grad_norm": 2.8572616577148438, |
|
"learning_rate": 0.00012734082727686196, |
|
"loss": 0.8983, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.009149130832570906, |
|
"eval_loss": 1.1016149520874023, |
|
"eval_runtime": 654.6016, |
|
"eval_samples_per_second": 17.577, |
|
"eval_steps_per_second": 4.395, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.009515096065873741, |
|
"grad_norm": 0.7957597970962524, |
|
"learning_rate": 0.0001198413382645404, |
|
"loss": 0.9094, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.009881061299176578, |
|
"grad_norm": 0.9204810261726379, |
|
"learning_rate": 0.00011228659539222137, |
|
"loss": 0.8334, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.010247026532479414, |
|
"grad_norm": 1.0104973316192627, |
|
"learning_rate": 0.00010471340460777866, |
|
"loss": 0.7847, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.01061299176578225, |
|
"grad_norm": 0.8759304285049438, |
|
"learning_rate": 9.715866173545961e-05, |
|
"loss": 0.7903, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.010978956999085087, |
|
"grad_norm": 3.211150646209717, |
|
"learning_rate": 8.965917272313806e-05, |
|
"loss": 0.7951, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.010978956999085087, |
|
"eval_loss": 0.9889414310455322, |
|
"eval_runtime": 653.2449, |
|
"eval_samples_per_second": 17.614, |
|
"eval_steps_per_second": 4.404, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.011344922232387923, |
|
"grad_norm": 0.8451036810874939, |
|
"learning_rate": 8.225147432743606e-05, |
|
"loss": 0.7332, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.01171088746569076, |
|
"grad_norm": 0.6947381496429443, |
|
"learning_rate": 7.497165611031821e-05, |
|
"loss": 0.7797, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.012076852698993596, |
|
"grad_norm": 0.7813195586204529, |
|
"learning_rate": 6.785518461437353e-05, |
|
"loss": 0.7427, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.012442817932296431, |
|
"grad_norm": 1.1668095588684082, |
|
"learning_rate": 6.093673057338509e-05, |
|
"loss": 0.6847, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.012808783165599268, |
|
"grad_norm": 1.675475001335144, |
|
"learning_rate": 5.4250000000000024e-05, |
|
"loss": 0.7452, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.012808783165599268, |
|
"eval_loss": 0.799678385257721, |
|
"eval_runtime": 653.6373, |
|
"eval_samples_per_second": 17.603, |
|
"eval_steps_per_second": 4.402, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.013174748398902104, |
|
"grad_norm": 0.8130257725715637, |
|
"learning_rate": 4.782756997342398e-05, |
|
"loss": 0.6847, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.01354071363220494, |
|
"grad_norm": 0.8550499081611633, |
|
"learning_rate": 4.170072992716607e-05, |
|
"loss": 0.6897, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.013906678865507777, |
|
"grad_norm": 0.7728462815284729, |
|
"learning_rate": 3.5899329210063916e-05, |
|
"loss": 0.7088, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.014272644098810613, |
|
"grad_norm": 1.0523146390914917, |
|
"learning_rate": 3.045163166325637e-05, |
|
"loss": 0.6873, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.01463860933211345, |
|
"grad_norm": 2.0533359050750732, |
|
"learning_rate": 2.5384177921590895e-05, |
|
"loss": 0.6468, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01463860933211345, |
|
"eval_loss": 0.7176544666290283, |
|
"eval_runtime": 654.5529, |
|
"eval_samples_per_second": 17.578, |
|
"eval_steps_per_second": 4.395, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.015004574565416285, |
|
"grad_norm": 0.7370195984840393, |
|
"learning_rate": 2.0721656110318213e-05, |
|
"loss": 0.6352, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.015370539798719121, |
|
"grad_norm": 0.7632651329040527, |
|
"learning_rate": 1.6486781567027783e-05, |
|
"loss": 0.6466, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.01573650503202196, |
|
"grad_norm": 0.9370796084403992, |
|
"learning_rate": 1.2700186174806422e-05, |
|
"loss": 0.6818, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.016102470265324794, |
|
"grad_norm": 0.8790647387504578, |
|
"learning_rate": 9.380317845777794e-06, |
|
"loss": 0.7192, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.01646843549862763, |
|
"grad_norm": 2.4070212841033936, |
|
"learning_rate": 6.543350644728947e-06, |
|
"loss": 0.6286, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.01646843549862763, |
|
"eval_loss": 0.6477181315422058, |
|
"eval_runtime": 654.5484, |
|
"eval_samples_per_second": 17.579, |
|
"eval_steps_per_second": 4.395, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.016834400731930467, |
|
"grad_norm": 0.7986826300621033, |
|
"learning_rate": 4.2031059906924e-06, |
|
"loss": 0.5613, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.0172003659652333, |
|
"grad_norm": 0.9289596080780029, |
|
"learning_rate": 2.3709853203820825e-06, |
|
"loss": 0.6961, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.01756633119853614, |
|
"grad_norm": 0.9395463466644287, |
|
"learning_rate": 1.0559145415396157e-06, |
|
"loss": 0.6394, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.017932296431838975, |
|
"grad_norm": 0.949378252029419, |
|
"learning_rate": 2.643005468090745e-07, |
|
"loss": 0.6338, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.018298261665141813, |
|
"grad_norm": 1.6931450366973877, |
|
"learning_rate": 0.0, |
|
"loss": 0.6262, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.018298261665141813, |
|
"eval_loss": 0.6367893815040588, |
|
"eval_runtime": 654.042, |
|
"eval_samples_per_second": 17.592, |
|
"eval_steps_per_second": 4.399, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.3877540397056e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|