{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.997342781222321,
  "eval_steps": 500,
  "global_step": 1692,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 1.2252470254898071,
      "learning_rate": 4.411764705882353e-06,
      "loss": 2.746,
      "step": 25
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.2458932399749756,
      "learning_rate": 8.823529411764707e-06,
      "loss": 2.7375,
      "step": 50
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.9806848764419556,
      "learning_rate": 1.323529411764706e-05,
      "loss": 2.6088,
      "step": 75
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.7579270601272583,
      "learning_rate": 1.7647058823529414e-05,
      "loss": 2.3883,
      "step": 100
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.5973591804504395,
      "learning_rate": 2.2058823529411766e-05,
      "loss": 2.148,
      "step": 125
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.4005584120750427,
      "learning_rate": 2.647058823529412e-05,
      "loss": 1.9697,
      "step": 150
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.30321258306503296,
      "learning_rate": 2.990144546649146e-05,
      "loss": 1.8807,
      "step": 175
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.33162617683410645,
      "learning_rate": 2.9408672798948752e-05,
      "loss": 1.788,
      "step": 200
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.35974565148353577,
      "learning_rate": 2.8915900131406044e-05,
      "loss": 1.7825,
      "step": 225
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.33461177349090576,
      "learning_rate": 2.842312746386334e-05,
      "loss": 1.7241,
      "step": 250
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.31440624594688416,
      "learning_rate": 2.793035479632063e-05,
      "loss": 1.6708,
      "step": 275
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.32463395595550537,
      "learning_rate": 2.7437582128777926e-05,
      "loss": 1.6098,
      "step": 300
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.35138121247291565,
      "learning_rate": 2.6944809461235218e-05,
      "loss": 1.5992,
      "step": 325
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.3213985562324524,
      "learning_rate": 2.645203679369251e-05,
      "loss": 1.5133,
      "step": 350
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.30699512362480164,
      "learning_rate": 2.5959264126149805e-05,
      "loss": 1.4807,
      "step": 375
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.3282407522201538,
      "learning_rate": 2.5466491458607097e-05,
      "loss": 1.4928,
      "step": 400
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.36608022451400757,
      "learning_rate": 2.4973718791064392e-05,
      "loss": 1.4802,
      "step": 425
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.38351428508758545,
      "learning_rate": 2.4480946123521684e-05,
      "loss": 1.4886,
      "step": 450
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.3251235783100128,
      "learning_rate": 2.3988173455978975e-05,
      "loss": 1.4557,
      "step": 475
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.3361181914806366,
      "learning_rate": 2.349540078843627e-05,
      "loss": 1.3857,
      "step": 500
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.36659279465675354,
      "learning_rate": 2.3002628120893562e-05,
      "loss": 1.4257,
      "step": 525
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.4663073420524597,
      "learning_rate": 2.2509855453350857e-05,
      "loss": 1.4516,
      "step": 550
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.3494812846183777,
      "learning_rate": 2.201708278580815e-05,
      "loss": 1.3714,
      "step": 575
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.3586205542087555,
      "learning_rate": 2.152431011826544e-05,
      "loss": 1.3569,
      "step": 600
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.38619765639305115,
      "learning_rate": 2.1031537450722736e-05,
      "loss": 1.3331,
      "step": 625
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.3586406111717224,
      "learning_rate": 2.0538764783180025e-05,
      "loss": 1.3446,
      "step": 650
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.33255332708358765,
      "learning_rate": 2.004599211563732e-05,
      "loss": 1.3421,
      "step": 675
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.362377792596817,
      "learning_rate": 1.955321944809461e-05,
      "loss": 1.3543,
      "step": 700
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.43340814113616943,
      "learning_rate": 1.9060446780551903e-05,
      "loss": 1.3592,
      "step": 725
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.43384671211242676,
      "learning_rate": 1.85676741130092e-05,
      "loss": 1.3393,
      "step": 750
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.6346977353096008,
      "learning_rate": 1.807490144546649e-05,
      "loss": 1.3142,
      "step": 775
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.37973758578300476,
      "learning_rate": 1.7582128777923785e-05,
      "loss": 1.3279,
      "step": 800
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.3827611207962036,
      "learning_rate": 1.7089356110381077e-05,
      "loss": 1.3263,
      "step": 825
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.4215668737888336,
      "learning_rate": 1.659658344283837e-05,
      "loss": 1.3353,
      "step": 850
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.3658856153488159,
      "learning_rate": 1.6103810775295664e-05,
      "loss": 1.3144,
      "step": 875
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.3511466383934021,
      "learning_rate": 1.5611038107752956e-05,
      "loss": 1.2621,
      "step": 900
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.3667345941066742,
      "learning_rate": 1.511826544021025e-05,
      "loss": 1.2563,
      "step": 925
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.40708670020103455,
      "learning_rate": 1.4625492772667543e-05,
      "loss": 1.2945,
      "step": 950
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.42138373851776123,
      "learning_rate": 1.4132720105124836e-05,
      "loss": 1.3062,
      "step": 975
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.35037311911582947,
      "learning_rate": 1.3639947437582128e-05,
      "loss": 1.2745,
      "step": 1000
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.48709043860435486,
      "learning_rate": 1.3147174770039422e-05,
      "loss": 1.3324,
      "step": 1025
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.38190776109695435,
      "learning_rate": 1.2654402102496715e-05,
      "loss": 1.2465,
      "step": 1050
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.4355185329914093,
      "learning_rate": 1.2161629434954009e-05,
      "loss": 1.2555,
      "step": 1075
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.41925591230392456,
      "learning_rate": 1.16688567674113e-05,
      "loss": 1.2977,
      "step": 1100
    },
    {
      "epoch": 1.99,
      "grad_norm": 0.3973592221736908,
      "learning_rate": 1.1176084099868594e-05,
      "loss": 1.2695,
      "step": 1125
    },
    {
      "epoch": 2.04,
      "grad_norm": 0.35202836990356445,
      "learning_rate": 1.0683311432325887e-05,
      "loss": 1.3174,
      "step": 1150
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.40330058336257935,
      "learning_rate": 1.019053876478318e-05,
      "loss": 1.2708,
      "step": 1175
    },
    {
      "epoch": 2.13,
      "grad_norm": 0.37632250785827637,
      "learning_rate": 9.697766097240474e-06,
      "loss": 1.2483,
      "step": 1200
    },
    {
      "epoch": 2.17,
      "grad_norm": 0.3991636633872986,
      "learning_rate": 9.204993429697766e-06,
      "loss": 1.2635,
      "step": 1225
    },
    {
      "epoch": 2.21,
      "grad_norm": 0.400860071182251,
      "learning_rate": 8.71222076215506e-06,
      "loss": 1.2464,
      "step": 1250
    },
    {
      "epoch": 2.26,
      "grad_norm": 0.40742385387420654,
      "learning_rate": 8.219448094612353e-06,
      "loss": 1.2532,
      "step": 1275
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.4327584207057953,
      "learning_rate": 7.726675427069646e-06,
      "loss": 1.2771,
      "step": 1300
    },
    {
      "epoch": 2.35,
      "grad_norm": 0.3787235915660858,
      "learning_rate": 7.233902759526938e-06,
      "loss": 1.2508,
      "step": 1325
    },
    {
      "epoch": 2.39,
      "grad_norm": 0.423367977142334,
      "learning_rate": 6.741130091984232e-06,
      "loss": 1.2382,
      "step": 1350
    },
    {
      "epoch": 2.44,
      "grad_norm": 0.3932174742221832,
      "learning_rate": 6.248357424441524e-06,
      "loss": 1.2759,
      "step": 1375
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.40788665413856506,
      "learning_rate": 5.755584756898818e-06,
      "loss": 1.2471,
      "step": 1400
    },
    {
      "epoch": 2.52,
      "grad_norm": 0.456950843334198,
      "learning_rate": 5.26281208935611e-06,
      "loss": 1.2384,
      "step": 1425
    },
    {
      "epoch": 2.57,
      "grad_norm": 0.36899831891059875,
      "learning_rate": 4.770039421813404e-06,
      "loss": 1.1969,
      "step": 1450
    },
    {
      "epoch": 2.61,
      "grad_norm": 0.44063258171081543,
      "learning_rate": 4.277266754270697e-06,
      "loss": 1.2758,
      "step": 1475
    },
    {
      "epoch": 2.66,
      "grad_norm": 0.3583914339542389,
      "learning_rate": 3.784494086727989e-06,
      "loss": 1.2332,
      "step": 1500
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.4087056517601013,
      "learning_rate": 3.2917214191852826e-06,
      "loss": 1.2646,
      "step": 1525
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.44617870450019836,
      "learning_rate": 2.7989487516425756e-06,
      "loss": 1.2265,
      "step": 1550
    },
    {
      "epoch": 2.79,
      "grad_norm": 0.4349097013473511,
      "learning_rate": 2.3061760840998687e-06,
      "loss": 1.2976,
      "step": 1575
    },
    {
      "epoch": 2.83,
      "grad_norm": 0.4503318965435028,
      "learning_rate": 1.8134034165571617e-06,
      "loss": 1.2467,
      "step": 1600
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.40000948309898376,
      "learning_rate": 1.3206307490144548e-06,
      "loss": 1.2146,
      "step": 1625
    },
    {
      "epoch": 2.92,
      "grad_norm": 0.39695021510124207,
      "learning_rate": 8.278580814717477e-07,
      "loss": 1.263,
      "step": 1650
    },
    {
      "epoch": 2.97,
      "grad_norm": 0.4137335419654846,
      "learning_rate": 3.3508541392904077e-07,
      "loss": 1.2436,
      "step": 1675
    }
  ],
  "logging_steps": 25,
  "max_steps": 1692,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 7269909176254464.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}