{
  "best_metric": 0.09464961290359497,
  "best_model_checkpoint": "outputs/checkpoint-540",
  "epoch": 9.840546697038725,
  "eval_steps": 500,
  "global_step": 540,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.36446469248291574,
      "grad_norm": 3.396773099899292,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 0.3727,
      "step": 20
    },
    {
      "epoch": 0.7289293849658315,
      "grad_norm": 1.4876775741577148,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 0.2861,
      "step": 40
    },
    {
      "epoch": 0.9840546697038725,
      "eval_loss": 0.18749132752418518,
      "eval_runtime": 49.826,
      "eval_samples_per_second": 3.372,
      "eval_steps_per_second": 0.421,
      "step": 54
    },
    {
      "epoch": 1.0933940774487472,
      "grad_norm": 0.6102920770645142,
      "learning_rate": 5.8e-06,
      "loss": 0.1811,
      "step": 60
    },
    {
      "epoch": 1.4578587699316627,
      "grad_norm": 0.4133767783641815,
      "learning_rate": 7.800000000000002e-06,
      "loss": 0.1346,
      "step": 80
    },
    {
      "epoch": 1.8223234624145785,
      "grad_norm": 1.7365530729293823,
      "learning_rate": 9.800000000000001e-06,
      "loss": 0.1243,
      "step": 100
    },
    {
      "epoch": 1.9863325740318907,
      "eval_loss": 0.13598798215389252,
      "eval_runtime": 49.8253,
      "eval_samples_per_second": 3.372,
      "eval_steps_per_second": 0.421,
      "step": 109
    },
    {
      "epoch": 2.1867881548974943,
      "grad_norm": 0.4655854403972626,
      "learning_rate": 9.958763523679515e-06,
      "loss": 0.1038,
      "step": 120
    },
    {
      "epoch": 2.55125284738041,
      "grad_norm": 0.6029446125030518,
      "learning_rate": 9.817090706862895e-06,
      "loss": 0.0873,
      "step": 140
    },
    {
      "epoch": 2.9157175398633255,
      "grad_norm": 0.4066179394721985,
      "learning_rate": 9.577355814597031e-06,
      "loss": 0.0862,
      "step": 160
    },
    {
      "epoch": 2.988610478359909,
      "eval_loss": 0.11723620444536209,
      "eval_runtime": 49.7707,
      "eval_samples_per_second": 3.375,
      "eval_steps_per_second": 0.422,
      "step": 164
    },
    {
      "epoch": 3.2801822323462413,
      "grad_norm": 0.6009082198143005,
      "learning_rate": 9.244439157950114e-06,
      "loss": 0.0834,
      "step": 180
    },
    {
      "epoch": 3.644646924829157,
      "grad_norm": 0.6393954157829285,
      "learning_rate": 8.825117959999117e-06,
      "loss": 0.0756,
      "step": 200
    },
    {
      "epoch": 3.990888382687927,
      "eval_loss": 0.10594599694013596,
      "eval_runtime": 49.8676,
      "eval_samples_per_second": 3.369,
      "eval_steps_per_second": 0.421,
      "step": 219
    },
    {
      "epoch": 4.009111617312073,
      "grad_norm": 0.5410734415054321,
      "learning_rate": 8.327928391111841e-06,
      "loss": 0.0733,
      "step": 220
    },
    {
      "epoch": 4.373576309794989,
      "grad_norm": 0.544377326965332,
      "learning_rate": 7.762991797134513e-06,
      "loss": 0.0684,
      "step": 240
    },
    {
      "epoch": 4.738041002277904,
      "grad_norm": 0.5764002799987793,
      "learning_rate": 7.1418086579779075e-06,
      "loss": 0.0628,
      "step": 260
    },
    {
      "epoch": 4.993166287015946,
      "eval_loss": 0.0995541512966156,
      "eval_runtime": 49.7114,
      "eval_samples_per_second": 3.38,
      "eval_steps_per_second": 0.422,
      "step": 274
    },
    {
      "epoch": 5.10250569476082,
      "grad_norm": 0.48410555720329285,
      "learning_rate": 6.477024471011001e-06,
      "loss": 0.0628,
      "step": 280
    },
    {
      "epoch": 5.466970387243736,
      "grad_norm": 0.6120467185974121,
      "learning_rate": 5.782172325201155e-06,
      "loss": 0.0594,
      "step": 300
    },
    {
      "epoch": 5.831435079726651,
      "grad_norm": 0.7353035807609558,
      "learning_rate": 5.071397406448937e-06,
      "loss": 0.0593,
      "step": 320
    },
    {
      "epoch": 5.995444191343964,
      "eval_loss": 0.09746743738651276,
      "eval_runtime": 49.7379,
      "eval_samples_per_second": 3.378,
      "eval_steps_per_second": 0.422,
      "step": 329
    },
    {
      "epoch": 6.195899772209567,
      "grad_norm": 0.5597842931747437,
      "learning_rate": 4.359169042394537e-06,
      "loss": 0.0606,
      "step": 340
    },
    {
      "epoch": 6.560364464692483,
      "grad_norm": 0.5548788905143738,
      "learning_rate": 3.6599861486331074e-06,
      "loss": 0.0548,
      "step": 360
    },
    {
      "epoch": 6.924829157175399,
      "grad_norm": 0.6110637784004211,
      "learning_rate": 2.9880820726046613e-06,
      "loss": 0.0498,
      "step": 380
    },
    {
      "epoch": 6.997722095671982,
      "eval_loss": 0.09508081525564194,
      "eval_runtime": 49.5881,
      "eval_samples_per_second": 3.388,
      "eval_steps_per_second": 0.423,
      "step": 384
    },
    {
      "epoch": 7.289293849658314,
      "grad_norm": 0.6793264150619507,
      "learning_rate": 2.3571348436857906e-06,
      "loss": 0.0485,
      "step": 400
    },
    {
      "epoch": 7.65375854214123,
      "grad_norm": 0.499203085899353,
      "learning_rate": 1.7799887279557238e-06,
      "loss": 0.0474,
      "step": 420
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.09498950093984604,
      "eval_runtime": 49.5797,
      "eval_samples_per_second": 3.388,
      "eval_steps_per_second": 0.424,
      "step": 439
    },
    {
      "epoch": 8.018223234624147,
      "grad_norm": 0.86496502161026,
      "learning_rate": 1.2683927559787657e-06,
      "loss": 0.0523,
      "step": 440
    },
    {
      "epoch": 8.382687927107062,
      "grad_norm": 0.7341931462287903,
      "learning_rate": 8.327615464234129e-07,
      "loss": 0.0423,
      "step": 460
    },
    {
      "epoch": 8.747152619589977,
      "grad_norm": 0.5355525016784668,
      "learning_rate": 4.819632944595415e-07,
      "loss": 0.047,
      "step": 480
    },
    {
      "epoch": 8.984054669703873,
      "eval_loss": 0.09474514424800873,
      "eval_runtime": 49.5868,
      "eval_samples_per_second": 3.388,
      "eval_steps_per_second": 0.424,
      "step": 493
    },
    {
      "epoch": 9.111617312072893,
      "grad_norm": 0.5788146257400513,
      "learning_rate": 2.2313924087851657e-07,
      "loss": 0.0548,
      "step": 500
    },
    {
      "epoch": 9.476082004555808,
      "grad_norm": 0.4824052155017853,
      "learning_rate": 6.15582970243117e-08,
      "loss": 0.0451,
      "step": 520
    },
    {
      "epoch": 9.840546697038725,
      "grad_norm": 0.9220362901687622,
      "learning_rate": 5.09784952833492e-10,
      "loss": 0.0453,
      "step": 540
    },
    {
      "epoch": 9.840546697038725,
      "eval_loss": 0.09464961290359497,
      "eval_runtime": 49.5747,
      "eval_samples_per_second": 3.389,
      "eval_steps_per_second": 0.424,
      "step": 540
    }
  ],
  "logging_steps": 20,
  "max_steps": 540,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 2.0191775689433088e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}