|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9101251422070534, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.26715654134750366, |
|
"learning_rate": 4.9992855064046754e-05, |
|
"loss": 2.6697, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.4067687392234802, |
|
"learning_rate": 4.997142434019578e-05, |
|
"loss": 2.5369, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.44472697377204895, |
|
"learning_rate": 4.9935720078139045e-05, |
|
"loss": 2.5661, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.45230674743652344, |
|
"learning_rate": 4.988576268624979e-05, |
|
"loss": 2.4824, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.4477585554122925, |
|
"learning_rate": 4.982158071991725e-05, |
|
"loss": 2.3343, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.3630908131599426, |
|
"learning_rate": 4.974321086522453e-05, |
|
"loss": 2.4377, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.3237389028072357, |
|
"learning_rate": 4.9650697917979025e-05, |
|
"loss": 2.4114, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.3014233708381653, |
|
"learning_rate": 4.954409475810737e-05, |
|
"loss": 2.2636, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.3054388761520386, |
|
"learning_rate": 4.942346231942955e-05, |
|
"loss": 2.2758, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.2940792143344879, |
|
"learning_rate": 4.92888695548294e-05, |
|
"loss": 2.3207, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.25064757466316223, |
|
"learning_rate": 4.9140393396841565e-05, |
|
"loss": 2.2209, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.3227023780345917, |
|
"learning_rate": 4.89781187136772e-05, |
|
"loss": 2.2328, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.312673419713974, |
|
"learning_rate": 4.880213826071375e-05, |
|
"loss": 2.2737, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.26930904388427734, |
|
"learning_rate": 4.861255262747643e-05, |
|
"loss": 2.2686, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.2343609631061554, |
|
"learning_rate": 4.8409470180141827e-05, |
|
"loss": 2.2661, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.3088403642177582, |
|
"learning_rate": 4.8193006999596294e-05, |
|
"loss": 2.191, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.3336387574672699, |
|
"learning_rate": 4.796328681508473e-05, |
|
"loss": 2.2106, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.28899675607681274, |
|
"learning_rate": 4.7720440933487575e-05, |
|
"loss": 2.2347, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.30223432183265686, |
|
"learning_rate": 4.746460816426647e-05, |
|
"loss": 2.2307, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.2692021429538727, |
|
"learning_rate": 4.7195934740121485e-05, |
|
"loss": 2.1503, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.3296695053577423, |
|
"learning_rate": 4.6914574233405236e-05, |
|
"loss": 2.2145, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.28714171051979065, |
|
"learning_rate": 4.662068746834176e-05, |
|
"loss": 2.1163, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.35001716017723083, |
|
"learning_rate": 4.6314442429100155e-05, |
|
"loss": 2.1868, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.32721835374832153, |
|
"learning_rate": 4.599601416377575e-05, |
|
"loss": 2.1865, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.30708980560302734, |
|
"learning_rate": 4.566558468433344e-05, |
|
"loss": 2.2035, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.3122975528240204, |
|
"learning_rate": 4.532334286257064e-05, |
|
"loss": 2.1762, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.3531278669834137, |
|
"learning_rate": 4.496948432215913e-05, |
|
"loss": 2.2452, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.3190854489803314, |
|
"learning_rate": 4.460421132682751e-05, |
|
"loss": 2.2267, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.29605475068092346, |
|
"learning_rate": 4.4227732664748365e-05, |
|
"loss": 2.2548, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.37863361835479736, |
|
"learning_rate": 4.384026352919595e-05, |
|
"loss": 2.2053, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.3235403895378113, |
|
"learning_rate": 4.344202539554285e-05, |
|
"loss": 2.193, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.3326057195663452, |
|
"learning_rate": 4.3033245894665814e-05, |
|
"loss": 2.2349, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.3827252686023712, |
|
"learning_rate": 4.261415868283304e-05, |
|
"loss": 2.1247, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.34777992963790894, |
|
"learning_rate": 4.218500330814753e-05, |
|
"loss": 2.1555, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.3294101059436798, |
|
"learning_rate": 4.174602507362258e-05, |
|
"loss": 2.1771, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.3454132676124573, |
|
"learning_rate": 4.1297474896967814e-05, |
|
"loss": 2.1616, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.3628969192504883, |
|
"learning_rate": 4.083960916716597e-05, |
|
"loss": 2.1681, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.31756460666656494, |
|
"learning_rate": 4.0372689597922215e-05, |
|
"loss": 2.146, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.41087692975997925, |
|
"learning_rate": 3.989698307806995e-05, |
|
"loss": 2.2185, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.3311336636543274, |
|
"learning_rate": 3.941276151901853e-05, |
|
"loss": 2.0976, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 657, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"total_flos": 6.836880289234944e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|