|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.820359281437126, |
|
"eval_steps": 500, |
|
"global_step": 410, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.23952095808383234, |
|
"grad_norm": 2.16796875, |
|
"learning_rate": 0.00019970658011837404, |
|
"loss": 0.5442, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.47904191616766467, |
|
"grad_norm": 2.02734375, |
|
"learning_rate": 0.00019882804237803488, |
|
"loss": 0.5445, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.718562874251497, |
|
"grad_norm": 1.2587890625, |
|
"learning_rate": 0.00019736954238777792, |
|
"loss": 0.5499, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9580838323353293, |
|
"grad_norm": 1.3486328125, |
|
"learning_rate": 0.00019533963920549306, |
|
"loss": 0.5429, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.1976047904191618, |
|
"grad_norm": 2.08984375, |
|
"learning_rate": 0.0001927502451102095, |
|
"loss": 0.5336, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.437125748502994, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.00018961655569610557, |
|
"loss": 0.5333, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.6766467065868262, |
|
"grad_norm": 2.63671875, |
|
"learning_rate": 0.00018595696069872013, |
|
"loss": 0.5401, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.9161676646706587, |
|
"grad_norm": 2.19921875, |
|
"learning_rate": 0.00018179293607667178, |
|
"loss": 0.5414, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.155688622754491, |
|
"grad_norm": 1.908203125, |
|
"learning_rate": 0.0001771489179821943, |
|
"loss": 0.5713, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.3952095808383236, |
|
"grad_norm": 4.09765625, |
|
"learning_rate": 0.0001720521593600787, |
|
"loss": 0.538, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.6347305389221556, |
|
"grad_norm": 2.0390625, |
|
"learning_rate": 0.00016653257001655652, |
|
"loss": 0.5199, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.874251497005988, |
|
"grad_norm": 1.8154296875, |
|
"learning_rate": 0.0001606225410966638, |
|
"loss": 0.5204, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.1137724550898205, |
|
"grad_norm": 1.9189453125, |
|
"learning_rate": 0.00015435675500012212, |
|
"loss": 0.5396, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 3.3532934131736525, |
|
"grad_norm": 1.8896484375, |
|
"learning_rate": 0.0001477719818512263, |
|
"loss": 0.5294, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.592814371257485, |
|
"grad_norm": 2.302734375, |
|
"learning_rate": 0.00014090686371713402, |
|
"loss": 0.5394, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.8323353293413174, |
|
"grad_norm": 2.16796875, |
|
"learning_rate": 0.00013380168784085027, |
|
"loss": 0.5279, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 4.07185628742515, |
|
"grad_norm": 2.11328125, |
|
"learning_rate": 0.0001264981502196662, |
|
"loss": 0.5182, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 4.311377245508982, |
|
"grad_norm": 1.9189453125, |
|
"learning_rate": 0.00011903911091646684, |
|
"loss": 0.5304, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 4.550898203592815, |
|
"grad_norm": 2.48046875, |
|
"learning_rate": 0.00011146834253984006, |
|
"loss": 0.5386, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.790419161676647, |
|
"grad_norm": 2.62890625, |
|
"learning_rate": 0.00010383027336900355, |
|
"loss": 0.5276, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 5.029940119760479, |
|
"grad_norm": 3.021484375, |
|
"learning_rate": 9.616972663099647e-05, |
|
"loss": 0.5229, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 5.269461077844311, |
|
"grad_norm": 3.259765625, |
|
"learning_rate": 8.853165746015997e-05, |
|
"loss": 0.5243, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 5.508982035928144, |
|
"grad_norm": 2.876953125, |
|
"learning_rate": 8.096088908353315e-05, |
|
"loss": 0.5206, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 5.748502994011976, |
|
"grad_norm": 3.349609375, |
|
"learning_rate": 7.350184978033386e-05, |
|
"loss": 0.5374, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.9880239520958085, |
|
"grad_norm": 3.791015625, |
|
"learning_rate": 6.619831215914974e-05, |
|
"loss": 0.5258, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 6.227544910179641, |
|
"grad_norm": 3.240234375, |
|
"learning_rate": 5.909313628286601e-05, |
|
"loss": 0.5166, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 6.467065868263473, |
|
"grad_norm": 3.248046875, |
|
"learning_rate": 5.222801814877369e-05, |
|
"loss": 0.5329, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 6.706586826347305, |
|
"grad_norm": 2.83984375, |
|
"learning_rate": 4.56432449998779e-05, |
|
"loss": 0.5117, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 6.946107784431137, |
|
"grad_norm": 4.03515625, |
|
"learning_rate": 3.937745890333623e-05, |
|
"loss": 0.5269, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 7.18562874251497, |
|
"grad_norm": 4.08203125, |
|
"learning_rate": 3.346742998344348e-05, |
|
"loss": 0.5299, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 7.425149700598802, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 2.794784063992131e-05, |
|
"loss": 0.5217, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 7.664670658682635, |
|
"grad_norm": 3.00390625, |
|
"learning_rate": 2.2851082017805703e-05, |
|
"loss": 0.5241, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 7.904191616766467, |
|
"grad_norm": 3.27734375, |
|
"learning_rate": 1.8207063923328237e-05, |
|
"loss": 0.5066, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 8.1437125748503, |
|
"grad_norm": 2.927734375, |
|
"learning_rate": 1.4043039301279903e-05, |
|
"loss": 0.5111, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 8.383233532934131, |
|
"grad_norm": 3.3984375, |
|
"learning_rate": 1.0383444303894452e-05, |
|
"loss": 0.5137, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 8.622754491017965, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 7.249754889790539e-06, |
|
"loss": 0.5279, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 8.862275449101796, |
|
"grad_norm": 3.19140625, |
|
"learning_rate": 4.660360794506946e-06, |
|
"loss": 0.5125, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 9.10179640718563, |
|
"grad_norm": 4.23046875, |
|
"learning_rate": 2.6304576122221035e-06, |
|
"loss": 0.5141, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 9.341317365269461, |
|
"grad_norm": 2.779296875, |
|
"learning_rate": 1.1719576219651585e-06, |
|
"loss": 0.5172, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 9.580838323353294, |
|
"grad_norm": 3.619140625, |
|
"learning_rate": 2.934198816259559e-07, |
|
"loss": 0.5389, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 9.820359281437126, |
|
"grad_norm": 4.30078125, |
|
"learning_rate": 0.0, |
|
"loss": 0.5024, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 9.820359281437126, |
|
"step": 410, |
|
"total_flos": 2.67670788243456e+16, |
|
"train_loss": 0.5285330202521348, |
|
"train_runtime": 355.5253, |
|
"train_samples_per_second": 4.697, |
|
"train_steps_per_second": 1.153 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 410, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 2.67670788243456e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|