env_dafcd9e / trainer_state.json
bimabk's picture
Upload task output 1
8c610ed verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.15016685205784205,
"eval_steps": 500,
"global_step": 135,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.8225839495658874,
"epoch": 0.0055617352614015575,
"grad_norm": 18.125,
"learning_rate": 9.757583950719854e-06,
"loss": 1.456,
"mean_token_accuracy": 0.7172651767730713,
"num_tokens": 10200.0,
"step": 5
},
{
"entropy": 0.990528690814972,
"epoch": 0.011123470522803115,
"grad_norm": 5.59375,
"learning_rate": 2.195456388911967e-05,
"loss": 1.1324,
"mean_token_accuracy": 0.7504398703575135,
"num_tokens": 20440.0,
"step": 10
},
{
"entropy": 1.0786633253097535,
"epoch": 0.01668520578420467,
"grad_norm": 4.375,
"learning_rate": 3.4151543827519494e-05,
"loss": 0.8838,
"mean_token_accuracy": 0.7859237432479859,
"num_tokens": 30680.0,
"step": 15
},
{
"entropy": 0.49569674730300906,
"epoch": 0.02224694104560623,
"grad_norm": 5.65625,
"learning_rate": 4.6348523765919305e-05,
"loss": 0.4962,
"mean_token_accuracy": 0.8680351972579956,
"num_tokens": 40920.0,
"step": 20
},
{
"entropy": 0.3256936728954315,
"epoch": 0.027808676307007785,
"grad_norm": 3.96875,
"learning_rate": 5.854550370431913e-05,
"loss": 0.342,
"mean_token_accuracy": 0.9047898292541504,
"num_tokens": 51160.0,
"step": 25
},
{
"entropy": 0.34028403759002684,
"epoch": 0.03337041156840934,
"grad_norm": 2.734375,
"learning_rate": 7.074248364271895e-05,
"loss": 0.3832,
"mean_token_accuracy": 0.8867057681083679,
"num_tokens": 61400.0,
"step": 30
},
{
"entropy": 0.397564172744751,
"epoch": 0.0389321468298109,
"grad_norm": 3.46875,
"learning_rate": 8.293946358111876e-05,
"loss": 0.4357,
"mean_token_accuracy": 0.8780049562454224,
"num_tokens": 71634.0,
"step": 35
},
{
"entropy": 0.39086252450942993,
"epoch": 0.04449388209121246,
"grad_norm": 2.171875,
"learning_rate": 8.537885171853211e-05,
"loss": 0.4262,
"mean_token_accuracy": 0.8823036670684814,
"num_tokens": 81859.0,
"step": 40
},
{
"entropy": 0.43406811356544495,
"epoch": 0.05005561735261402,
"grad_norm": 3.5,
"learning_rate": 8.53788198268306e-05,
"loss": 0.4289,
"mean_token_accuracy": 0.8754657983779908,
"num_tokens": 91643.0,
"step": 45
},
{
"entropy": 0.33308460712432864,
"epoch": 0.05561735261401557,
"grad_norm": 2.046875,
"learning_rate": 8.537876340307694e-05,
"loss": 0.3425,
"mean_token_accuracy": 0.9025415539741516,
"num_tokens": 101883.0,
"step": 50
},
{
"entropy": 0.3366609990596771,
"epoch": 0.06117908787541713,
"grad_norm": 2.203125,
"learning_rate": 8.537868244731438e-05,
"loss": 0.3744,
"mean_token_accuracy": 0.8893450617790222,
"num_tokens": 112123.0,
"step": 55
},
{
"entropy": 0.36333391070365906,
"epoch": 0.06674082313681869,
"grad_norm": 1.8203125,
"learning_rate": 8.537857695960494e-05,
"loss": 0.401,
"mean_token_accuracy": 0.8837732076644897,
"num_tokens": 122363.0,
"step": 60
},
{
"entropy": 0.5008107841014862,
"epoch": 0.07230255839822025,
"grad_norm": 2.484375,
"learning_rate": 8.537844694002943e-05,
"loss": 0.5051,
"mean_token_accuracy": 0.857869005203247,
"num_tokens": 132603.0,
"step": 65
},
{
"entropy": 0.5081609010696411,
"epoch": 0.0778642936596218,
"grad_norm": 2.515625,
"learning_rate": 8.537829238868749e-05,
"loss": 0.5758,
"mean_token_accuracy": 0.8529759764671325,
"num_tokens": 142833.0,
"step": 70
},
{
"entropy": 1.2066489636898041,
"epoch": 0.08342602892102335,
"grad_norm": 338.0,
"learning_rate": 8.537811330569756e-05,
"loss": 1.6885,
"mean_token_accuracy": 0.7405156970024109,
"num_tokens": 153063.0,
"step": 75
},
{
"entropy": 0.3763828158378601,
"epoch": 0.08898776418242492,
"grad_norm": 1.546875,
"learning_rate": 8.537790969119681e-05,
"loss": 0.407,
"mean_token_accuracy": 0.8820860385894775,
"num_tokens": 163299.0,
"step": 80
},
{
"entropy": 0.371926474571228,
"epoch": 0.09454949944382647,
"grad_norm": 1.78125,
"learning_rate": 8.537768154534127e-05,
"loss": 0.3733,
"mean_token_accuracy": 0.8954056620597839,
"num_tokens": 173539.0,
"step": 85
},
{
"entropy": 0.3801657140254974,
"epoch": 0.10011123470522804,
"grad_norm": 1.375,
"learning_rate": 8.537742886830578e-05,
"loss": 0.3588,
"mean_token_accuracy": 0.8942199230194092,
"num_tokens": 183779.0,
"step": 90
},
{
"entropy": 0.26907028555870055,
"epoch": 0.10567296996662959,
"grad_norm": 1.828125,
"learning_rate": 8.537715166028392e-05,
"loss": 0.2725,
"mean_token_accuracy": 0.9173020601272583,
"num_tokens": 194019.0,
"step": 95
},
{
"entropy": 0.28473606407642366,
"epoch": 0.11123470522803114,
"grad_norm": 2.046875,
"learning_rate": 8.537684992148809e-05,
"loss": 0.3359,
"mean_token_accuracy": 0.9006433963775635,
"num_tokens": 204222.0,
"step": 100
},
{
"entropy": 0.34853672981262207,
"epoch": 0.1167964404894327,
"grad_norm": 1.90625,
"learning_rate": 8.537652365214949e-05,
"loss": 0.3321,
"mean_token_accuracy": 0.9035966873168946,
"num_tokens": 214237.0,
"step": 105
},
{
"entropy": 0.3243670523166656,
"epoch": 0.12235817575083426,
"grad_norm": 1.484375,
"learning_rate": 8.537617285251812e-05,
"loss": 0.3225,
"mean_token_accuracy": 0.9064516067504883,
"num_tokens": 224477.0,
"step": 110
},
{
"entropy": 0.3252596139907837,
"epoch": 0.12791991101223582,
"grad_norm": 1.5546875,
"learning_rate": 8.537579752286277e-05,
"loss": 0.3515,
"mean_token_accuracy": 0.8956989169120788,
"num_tokens": 234717.0,
"step": 115
},
{
"entropy": 0.27094421088695525,
"epoch": 0.13348164627363737,
"grad_norm": 3.71875,
"learning_rate": 8.537539766347103e-05,
"loss": 0.302,
"mean_token_accuracy": 0.9114369511604309,
"num_tokens": 244957.0,
"step": 120
},
{
"entropy": 0.3085549890995026,
"epoch": 0.13904338153503892,
"grad_norm": 1.4921875,
"learning_rate": 8.537497327464926e-05,
"loss": 0.3061,
"mean_token_accuracy": 0.9045943140983581,
"num_tokens": 255197.0,
"step": 125
},
{
"entropy": 0.31254134476184847,
"epoch": 0.1446051167964405,
"grad_norm": 1.7109375,
"learning_rate": 8.537452435672265e-05,
"loss": 0.3357,
"mean_token_accuracy": 0.9013739109039307,
"num_tokens": 265354.0,
"step": 130
},
{
"entropy": 0.282534995675087,
"epoch": 0.15016685205784205,
"grad_norm": 1.3203125,
"learning_rate": 8.537405091003517e-05,
"loss": 0.2944,
"mean_token_accuracy": 0.9155425190925598,
"num_tokens": 275594.0,
"step": 135
}
],
"logging_steps": 5,
"max_steps": 17980,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4588266032463872.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}