{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9402985074626864,
  "eval_steps": 7,
  "global_step": 132,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.029850746268656716,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 0.8787,
      "step": 1
    },
    {
      "epoch": 0.208955223880597,
      "grad_norm": 3.4440434140026612,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 0.8776,
      "step": 7
    },
    {
      "epoch": 0.208955223880597,
      "eval_loss": 0.7901861071586609,
      "eval_runtime": 150.9691,
      "eval_samples_per_second": 3.153,
      "eval_steps_per_second": 0.026,
      "step": 7
    },
    {
      "epoch": 0.417910447761194,
      "grad_norm": 2.901557012984361,
      "learning_rate": 1.8e-06,
      "loss": 0.8473,
      "step": 14
    },
    {
      "epoch": 0.417910447761194,
      "eval_loss": 0.7702628374099731,
      "eval_runtime": 143.7629,
      "eval_samples_per_second": 3.311,
      "eval_steps_per_second": 0.028,
      "step": 14
    },
    {
      "epoch": 0.6268656716417911,
      "grad_norm": 2.4942078074256164,
      "learning_rate": 2.5095609265912853e-06,
      "loss": 0.8293,
      "step": 21
    },
    {
      "epoch": 0.6268656716417911,
      "eval_loss": 0.760272204875946,
      "eval_runtime": 143.6002,
      "eval_samples_per_second": 3.315,
      "eval_steps_per_second": 0.028,
      "step": 21
    },
    {
      "epoch": 0.835820895522388,
      "grad_norm": 2.227024549115603,
      "learning_rate": 1.3197749551783641e-06,
      "loss": 0.8173,
      "step": 28
    },
    {
      "epoch": 0.835820895522388,
      "eval_loss": 0.7481057047843933,
      "eval_runtime": 139.8099,
      "eval_samples_per_second": 3.405,
      "eval_steps_per_second": 0.029,
      "step": 28
    },
    {
      "epoch": 1.044776119402985,
      "grad_norm": 1.890361948752719,
      "learning_rate": 6.783887430182062e-07,
      "loss": 0.7415,
      "step": 35
    },
    {
      "epoch": 1.044776119402985,
      "eval_loss": 0.7402028441429138,
      "eval_runtime": 139.4376,
      "eval_samples_per_second": 3.414,
      "eval_steps_per_second": 0.029,
      "step": 35
    },
    {
      "epoch": 1.2537313432835822,
      "grad_norm": 1.890868926771716,
      "learning_rate": 3.8102735091851235e-07,
      "loss": 0.6794,
      "step": 42
    },
    {
      "epoch": 1.2537313432835822,
      "eval_loss": 0.7418723106384277,
      "eval_runtime": 139.6461,
      "eval_samples_per_second": 3.409,
      "eval_steps_per_second": 0.029,
      "step": 42
    },
    {
      "epoch": 1.462686567164179,
      "grad_norm": 3.6823973563358674,
      "learning_rate": 1.9899658436440185e-07,
      "loss": 0.6688,
      "step": 49
    },
    {
      "epoch": 1.462686567164179,
      "eval_loss": 0.7392202615737915,
      "eval_runtime": 145.8028,
      "eval_samples_per_second": 3.265,
      "eval_steps_per_second": 0.027,
      "step": 49
    },
    {
      "epoch": 1.671641791044776,
      "grad_norm": 1.8056965838105696,
      "learning_rate": 1.1300091285551449e-07,
      "loss": 0.6498,
      "step": 56
    },
    {
      "epoch": 1.671641791044776,
      "eval_loss": 0.7367225289344788,
      "eval_runtime": 141.1059,
      "eval_samples_per_second": 3.373,
      "eval_steps_per_second": 0.028,
      "step": 56
    },
    {
      "epoch": 1.8805970149253732,
      "grad_norm": 1.5388706627418591,
      "learning_rate": 7.476064096023686e-08,
      "loss": 0.6701,
      "step": 63
    },
    {
      "epoch": 1.8805970149253732,
      "eval_loss": 0.7358315587043762,
      "eval_runtime": 143.703,
      "eval_samples_per_second": 3.312,
      "eval_steps_per_second": 0.028,
      "step": 63
    },
    {
      "epoch": 2.08955223880597,
      "grad_norm": 1.7131943205261349,
      "learning_rate": 5.89232146321995e-08,
      "loss": 0.664,
      "step": 70
    },
    {
      "epoch": 2.08955223880597,
      "eval_loss": 0.7354702353477478,
      "eval_runtime": 136.9065,
      "eval_samples_per_second": 3.477,
      "eval_steps_per_second": 0.029,
      "step": 70
    },
    {
      "epoch": 2.298507462686567,
      "grad_norm": 1.542023361934585,
      "learning_rate": 5.289674857255442e-08,
      "loss": 0.6447,
      "step": 77
    },
    {
      "epoch": 2.298507462686567,
      "eval_loss": 0.736127495765686,
      "eval_runtime": 136.173,
      "eval_samples_per_second": 3.496,
      "eval_steps_per_second": 0.029,
      "step": 77
    },
    {
      "epoch": 2.5074626865671643,
      "grad_norm": 2.5338510966510115,
      "learning_rate": 5.082712625717188e-08,
      "loss": 0.6412,
      "step": 84
    },
    {
      "epoch": 2.5074626865671643,
      "eval_loss": 0.7373142242431641,
      "eval_runtime": 138.002,
      "eval_samples_per_second": 3.449,
      "eval_steps_per_second": 0.029,
      "step": 84
    },
    {
      "epoch": 2.716417910447761,
      "grad_norm": 1.9360637903054263,
      "learning_rate": 5.020097212085352e-08,
      "loss": 0.6458,
      "step": 91
    },
    {
      "epoch": 2.716417910447761,
      "eval_loss": 0.7382717728614807,
      "eval_runtime": 139.7716,
      "eval_samples_per_second": 3.406,
      "eval_steps_per_second": 0.029,
      "step": 91
    },
    {
      "epoch": 2.925373134328358,
      "grad_norm": 2.733791738493569,
      "learning_rate": 5.0050722602692304e-08,
      "loss": 0.6356,
      "step": 98
    },
    {
      "epoch": 2.925373134328358,
      "eval_loss": 0.7387175559997559,
      "eval_runtime": 142.5673,
      "eval_samples_per_second": 3.339,
      "eval_steps_per_second": 0.028,
      "step": 98
    },
    {
      "epoch": 3.1343283582089554,
      "grad_norm": 1.3743623511988,
      "learning_rate": 5.001050931854095e-08,
      "loss": 0.6398,
      "step": 105
    },
    {
      "epoch": 3.1343283582089554,
      "eval_loss": 0.7387120723724365,
      "eval_runtime": 139.359,
      "eval_samples_per_second": 3.416,
      "eval_steps_per_second": 0.029,
      "step": 105
    },
    {
      "epoch": 3.343283582089552,
      "grad_norm": 1.5953948429140674,
      "learning_rate": 5.000119265172339e-08,
      "loss": 0.6228,
      "step": 112
    },
    {
      "epoch": 3.343283582089552,
      "eval_loss": 0.7390681505203247,
      "eval_runtime": 140.4569,
      "eval_samples_per_second": 3.389,
      "eval_steps_per_second": 0.028,
      "step": 112
    },
    {
      "epoch": 3.5522388059701493,
      "grad_norm": 1.8688576586340278,
      "learning_rate": 5.0000078923070654e-08,
      "loss": 0.6139,
      "step": 119
    },
    {
      "epoch": 3.5522388059701493,
      "eval_loss": 0.7394906282424927,
      "eval_runtime": 143.4431,
      "eval_samples_per_second": 3.318,
      "eval_steps_per_second": 0.028,
      "step": 119
    },
    {
      "epoch": 3.7611940298507465,
      "grad_norm": 3.2568458543215315,
      "learning_rate": 5.000000212746016e-08,
      "loss": 0.591,
      "step": 126
    },
    {
      "epoch": 3.7611940298507465,
      "eval_loss": 0.7398449778556824,
      "eval_runtime": 139.4002,
      "eval_samples_per_second": 3.415,
      "eval_steps_per_second": 0.029,
      "step": 126
    },
    {
      "epoch": 3.9402985074626864,
      "step": 132,
      "total_flos": 130172942548992.0,
      "train_loss": 0.6859688226020697,
      "train_runtime": 25080.2087,
      "train_samples_per_second": 0.683,
      "train_steps_per_second": 0.005
    }
  ],
  "logging_steps": 7,
  "max_steps": 132,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 27,
  "total_flos": 130172942548992.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}