|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.859154929577464, |
|
"eval_steps": 500, |
|
"global_step": 350, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.28169014084507044, |
|
"grad_norm": 6.4921875, |
|
"learning_rate": 0.00019959742939952392, |
|
"loss": 1.0479, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.5633802816901409, |
|
"grad_norm": 7.4765625, |
|
"learning_rate": 0.00019839295885986296, |
|
"loss": 1.0329, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.8450704225352113, |
|
"grad_norm": 8.5546875, |
|
"learning_rate": 0.00019639628606958533, |
|
"loss": 0.9256, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.1267605633802817, |
|
"grad_norm": 7.9453125, |
|
"learning_rate": 0.00019362348706397373, |
|
"loss": 0.8794, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.408450704225352, |
|
"grad_norm": 9.7578125, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 0.8596, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.6901408450704225, |
|
"grad_norm": 9.765625, |
|
"learning_rate": 0.00018584487936018661, |
|
"loss": 0.8522, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.971830985915493, |
|
"grad_norm": 10.0859375, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.8563, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.2535211267605635, |
|
"grad_norm": 8.421875, |
|
"learning_rate": 0.00017530714660036112, |
|
"loss": 0.826, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.535211267605634, |
|
"grad_norm": 9.7265625, |
|
"learning_rate": 0.00016910626489868649, |
|
"loss": 0.8266, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.816901408450704, |
|
"grad_norm": 11.2265625, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.8513, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.0985915492957745, |
|
"grad_norm": 10.9765625, |
|
"learning_rate": 0.00015508969814521025, |
|
"loss": 0.8051, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.380281690140845, |
|
"grad_norm": 12.53125, |
|
"learning_rate": 0.00014738686624729986, |
|
"loss": 0.8422, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.6619718309859155, |
|
"grad_norm": 10.234375, |
|
"learning_rate": 0.00013930250316539238, |
|
"loss": 0.8595, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 3.943661971830986, |
|
"grad_norm": 12.8671875, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.8358, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.225352112676056, |
|
"grad_norm": 11.7109375, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 0.8135, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.507042253521127, |
|
"grad_norm": 9.9296875, |
|
"learning_rate": 0.00011342332658176555, |
|
"loss": 0.7967, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 4.788732394366197, |
|
"grad_norm": 10.546875, |
|
"learning_rate": 0.00010448648303505151, |
|
"loss": 0.7877, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.070422535211268, |
|
"grad_norm": 14.2265625, |
|
"learning_rate": 9.551351696494854e-05, |
|
"loss": 0.7516, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.352112676056338, |
|
"grad_norm": 11.4375, |
|
"learning_rate": 8.657667341823448e-05, |
|
"loss": 0.7581, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.633802816901408, |
|
"grad_norm": 10.5859375, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 0.7491, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 5.915492957746479, |
|
"grad_norm": 9.6796875, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.7307, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.197183098591549, |
|
"grad_norm": 8.890625, |
|
"learning_rate": 6.069749683460765e-05, |
|
"loss": 0.7323, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.47887323943662, |
|
"grad_norm": 11.2734375, |
|
"learning_rate": 5.261313375270014e-05, |
|
"loss": 0.7145, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 6.76056338028169, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 4.491030185478976e-05, |
|
"loss": 0.7525, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.042253521126761, |
|
"grad_norm": 10.9765625, |
|
"learning_rate": 3.7651019814126654e-05, |
|
"loss": 0.7138, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.323943661971831, |
|
"grad_norm": 9.625, |
|
"learning_rate": 3.089373510131354e-05, |
|
"loss": 0.7266, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 7.605633802816901, |
|
"grad_norm": 9.6484375, |
|
"learning_rate": 2.4692853399638917e-05, |
|
"loss": 0.7136, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 7.887323943661972, |
|
"grad_norm": 10.25, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.7194, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 8.169014084507042, |
|
"grad_norm": 11.3203125, |
|
"learning_rate": 1.415512063981339e-05, |
|
"loss": 0.7137, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 8.450704225352112, |
|
"grad_norm": 13.578125, |
|
"learning_rate": 9.903113209758096e-06, |
|
"loss": 0.7186, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 8.732394366197184, |
|
"grad_norm": 8.828125, |
|
"learning_rate": 6.37651293602628e-06, |
|
"loss": 0.7232, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 9.014084507042254, |
|
"grad_norm": 11.9140625, |
|
"learning_rate": 3.6037139304146762e-06, |
|
"loss": 0.7116, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 9.295774647887324, |
|
"grad_norm": 9.4453125, |
|
"learning_rate": 1.6070411401370334e-06, |
|
"loss": 0.7106, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 9.577464788732394, |
|
"grad_norm": 12.828125, |
|
"learning_rate": 4.025706004760932e-07, |
|
"loss": 0.727, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 9.859154929577464, |
|
"grad_norm": 10.4453125, |
|
"learning_rate": 0.0, |
|
"loss": 0.704, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 9.859154929577464, |
|
"step": 350, |
|
"total_flos": 2.2849945337856e+16, |
|
"train_loss": 0.7934087766919817, |
|
"train_runtime": 317.5625, |
|
"train_samples_per_second": 4.472, |
|
"train_steps_per_second": 1.102 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 350, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 2.2849945337856e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|