|
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 2.932811944543192,
|
|
"eval_steps": 500,
|
|
"global_step": 16500,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.08887308922858159,
|
|
"grad_norm": 20.37746810913086,
|
|
"learning_rate": 4.851878184619031e-05,
|
|
"loss": 0.6617,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.17774617845716317,
|
|
"grad_norm": 8.832418441772461,
|
|
"learning_rate": 4.703756369238061e-05,
|
|
"loss": 0.5007,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.26661926768574473,
|
|
"grad_norm": 112.71693420410156,
|
|
"learning_rate": 4.555634553857092e-05,
|
|
"loss": 0.4867,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.35549235691432635,
|
|
"grad_norm": 0.7494603991508484,
|
|
"learning_rate": 4.407512738476123e-05,
|
|
"loss": 0.5113,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.4443654461429079,
|
|
"grad_norm": 0.5181692838668823,
|
|
"learning_rate": 4.259390923095153e-05,
|
|
"loss": 0.557,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.5332385353714895,
|
|
"grad_norm": 0.8127353191375732,
|
|
"learning_rate": 4.1112691077141844e-05,
|
|
"loss": 0.4746,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.6221116246000711,
|
|
"grad_norm": 52.21469497680664,
|
|
"learning_rate": 3.9631472923332156e-05,
|
|
"loss": 0.4848,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.7109847138286527,
|
|
"grad_norm": 3.922440767288208,
|
|
"learning_rate": 3.815025476952246e-05,
|
|
"loss": 0.4689,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.7998578030572343,
|
|
"grad_norm": 39.936492919921875,
|
|
"learning_rate": 3.6669036615712765e-05,
|
|
"loss": 0.4724,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.8887308922858158,
|
|
"grad_norm": 3.033518075942993,
|
|
"learning_rate": 3.518781846190307e-05,
|
|
"loss": 0.4384,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.9776039815143974,
|
|
"grad_norm": 8.464579582214355,
|
|
"learning_rate": 3.370660030809338e-05,
|
|
"loss": 0.4534,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 1.066477070742979,
|
|
"grad_norm": 0.24204222857952118,
|
|
"learning_rate": 3.2225382154283686e-05,
|
|
"loss": 0.3978,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 1.1553501599715605,
|
|
"grad_norm": 0.057052597403526306,
|
|
"learning_rate": 3.074416400047399e-05,
|
|
"loss": 0.3852,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 1.2442232492001422,
|
|
"grad_norm": 0.15880955755710602,
|
|
"learning_rate": 2.92629458466643e-05,
|
|
"loss": 0.3915,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 1.3330963384287238,
|
|
"grad_norm": 0.09902948141098022,
|
|
"learning_rate": 2.7781727692854603e-05,
|
|
"loss": 0.3785,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 1.4219694276573054,
|
|
"grad_norm": 0.15144290030002594,
|
|
"learning_rate": 2.6300509539044908e-05,
|
|
"loss": 0.3587,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 1.510842516885887,
|
|
"grad_norm": 0.22604715824127197,
|
|
"learning_rate": 2.481929138523522e-05,
|
|
"loss": 0.3811,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 1.5997156061144686,
|
|
"grad_norm": 14.215106964111328,
|
|
"learning_rate": 2.3338073231425524e-05,
|
|
"loss": 0.3591,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 1.68858869534305,
|
|
"grad_norm": 0.2168210744857788,
|
|
"learning_rate": 2.1856855077615832e-05,
|
|
"loss": 0.37,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"epoch": 1.7774617845716318,
|
|
"grad_norm": 0.15380479395389557,
|
|
"learning_rate": 2.0375636923806137e-05,
|
|
"loss": 0.3492,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 1.8663348738002132,
|
|
"grad_norm": 0.09666112810373306,
|
|
"learning_rate": 1.889441876999645e-05,
|
|
"loss": 0.3922,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"epoch": 1.9552079630287948,
|
|
"grad_norm": 0.7467890381813049,
|
|
"learning_rate": 1.7413200616186753e-05,
|
|
"loss": 0.3786,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 2.0440810522573765,
|
|
"grad_norm": 1.9722317457199097,
|
|
"learning_rate": 1.593198246237706e-05,
|
|
"loss": 0.3258,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 2.132954141485958,
|
|
"grad_norm": 0.07470700144767761,
|
|
"learning_rate": 1.4450764308567366e-05,
|
|
"loss": 0.2807,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 2.2218272307145397,
|
|
"grad_norm": 0.11256339401006699,
|
|
"learning_rate": 1.2969546154757672e-05,
|
|
"loss": 0.2751,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"epoch": 2.310700319943121,
|
|
"grad_norm": 0.010098825208842754,
|
|
"learning_rate": 1.148832800094798e-05,
|
|
"loss": 0.2385,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"epoch": 2.399573409171703,
|
|
"grad_norm": 0.15252766013145447,
|
|
"learning_rate": 1.0007109847138287e-05,
|
|
"loss": 0.2379,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"epoch": 2.4884464984002843,
|
|
"grad_norm": 17.05082130432129,
|
|
"learning_rate": 8.525891693328595e-06,
|
|
"loss": 0.2706,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"epoch": 2.5773195876288657,
|
|
"grad_norm": 0.11130794882774353,
|
|
"learning_rate": 7.0446735395189e-06,
|
|
"loss": 0.2591,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"epoch": 2.6661926768574475,
|
|
"grad_norm": 74.68379974365234,
|
|
"learning_rate": 5.5634553857092076e-06,
|
|
"loss": 0.3314,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 2.7550657660860294,
|
|
"grad_norm": 15.91612434387207,
|
|
"learning_rate": 4.082237231899514e-06,
|
|
"loss": 0.2514,
|
|
"step": 15500
|
|
},
|
|
{
|
|
"epoch": 2.8439388553146108,
|
|
"grad_norm": 39.0718994140625,
|
|
"learning_rate": 2.601019078089821e-06,
|
|
"loss": 0.2358,
|
|
"step": 16000
|
|
},
|
|
{
|
|
"epoch": 2.932811944543192,
|
|
"grad_norm": 0.09988761693239212,
|
|
"learning_rate": 1.119800924280128e-06,
|
|
"loss": 0.2359,
|
|
"step": 16500
|
|
}
|
|
],
|
|
"logging_steps": 500,
|
|
"max_steps": 16878,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4794112531464192.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|