yalhessi's picture
Training in progress, epoch 5, checkpoint
a26ff7a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 720,
"global_step": 17995,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.13892747985551543,
"grad_norm": 0.4299773573875427,
"learning_rate": 0.0003953968695007873,
"loss": 0.2967,
"step": 500
},
{
"epoch": 0.2000555709919422,
"eval_loss": 0.1983761489391327,
"eval_runtime": 16.4512,
"eval_samples_per_second": 30.393,
"eval_steps_per_second": 3.83,
"step": 720
},
{
"epoch": 0.27785495971103086,
"grad_norm": 0.5706949830055237,
"learning_rate": 0.0003907659535056034,
"loss": 0.1959,
"step": 1000
},
{
"epoch": 0.4001111419838844,
"eval_loss": 0.17383727431297302,
"eval_runtime": 16.4842,
"eval_samples_per_second": 30.332,
"eval_steps_per_second": 3.822,
"step": 1440
},
{
"epoch": 0.41678243956654626,
"grad_norm": 0.5392869710922241,
"learning_rate": 0.0003861350375104196,
"loss": 0.1723,
"step": 1500
},
{
"epoch": 0.5557099194220617,
"grad_norm": 0.5005412697792053,
"learning_rate": 0.0003815041215152357,
"loss": 0.1636,
"step": 2000
},
{
"epoch": 0.6001667129758266,
"eval_loss": 0.16229495406150818,
"eval_runtime": 16.4963,
"eval_samples_per_second": 30.31,
"eval_steps_per_second": 3.819,
"step": 2160
},
{
"epoch": 0.6946373992775771,
"grad_norm": 0.370914489030838,
"learning_rate": 0.0003768732055200519,
"loss": 0.1539,
"step": 2500
},
{
"epoch": 0.8002222839677688,
"eval_loss": 0.1537286937236786,
"eval_runtime": 16.4662,
"eval_samples_per_second": 30.365,
"eval_steps_per_second": 3.826,
"step": 2880
},
{
"epoch": 0.8335648791330925,
"grad_norm": 0.41378697752952576,
"learning_rate": 0.000372242289524868,
"loss": 0.1445,
"step": 3000
},
{
"epoch": 0.972492358988608,
"grad_norm": 0.4000154137611389,
"learning_rate": 0.0003676113735296842,
"loss": 0.1384,
"step": 3500
},
{
"epoch": 1.000277854959711,
"eval_loss": 0.15115150809288025,
"eval_runtime": 16.8021,
"eval_samples_per_second": 29.758,
"eval_steps_per_second": 3.75,
"step": 3600
},
{
"epoch": 1.1114198388441234,
"grad_norm": 0.4774770140647888,
"learning_rate": 0.00036298045753450036,
"loss": 0.1317,
"step": 4000
},
{
"epoch": 1.2003334259516532,
"eval_loss": 0.14330309629440308,
"eval_runtime": 16.4814,
"eval_samples_per_second": 30.337,
"eval_steps_per_second": 3.822,
"step": 4320
},
{
"epoch": 1.2503473186996388,
"grad_norm": 0.4988621175289154,
"learning_rate": 0.0003583495415393165,
"loss": 0.1283,
"step": 4500
},
{
"epoch": 1.3892747985551543,
"grad_norm": 0.5688238739967346,
"learning_rate": 0.000353727887376123,
"loss": 0.1237,
"step": 5000
},
{
"epoch": 1.4003889969435954,
"eval_loss": 0.13683326542377472,
"eval_runtime": 16.5065,
"eval_samples_per_second": 30.291,
"eval_steps_per_second": 3.817,
"step": 5040
},
{
"epoch": 1.5282022784106695,
"grad_norm": 0.5433902740478516,
"learning_rate": 0.00034909697138093914,
"loss": 0.1174,
"step": 5500
},
{
"epoch": 1.6004445679355377,
"eval_loss": 0.13955478370189667,
"eval_runtime": 16.4989,
"eval_samples_per_second": 30.305,
"eval_steps_per_second": 3.818,
"step": 5760
},
{
"epoch": 1.667129758266185,
"grad_norm": 0.5300644040107727,
"learning_rate": 0.0003444660553857553,
"loss": 0.1178,
"step": 6000
},
{
"epoch": 1.8005001389274797,
"eval_loss": 0.13158197700977325,
"eval_runtime": 16.533,
"eval_samples_per_second": 30.243,
"eval_steps_per_second": 3.811,
"step": 6480
},
{
"epoch": 1.8060572381217006,
"grad_norm": 0.3573897182941437,
"learning_rate": 0.0003398351393905715,
"loss": 0.113,
"step": 6500
},
{
"epoch": 1.9449847179772157,
"grad_norm": 0.5019258260726929,
"learning_rate": 0.00033520422339538766,
"loss": 0.1134,
"step": 7000
},
{
"epoch": 2.000555709919422,
"eval_loss": 0.13124322891235352,
"eval_runtime": 16.0696,
"eval_samples_per_second": 31.115,
"eval_steps_per_second": 3.92,
"step": 7200
},
{
"epoch": 2.0839121978327313,
"grad_norm": 0.4890081286430359,
"learning_rate": 0.0003305733074002038,
"loss": 0.1075,
"step": 7500
},
{
"epoch": 2.2006112809113643,
"eval_loss": 0.12690122425556183,
"eval_runtime": 15.8697,
"eval_samples_per_second": 31.507,
"eval_steps_per_second": 3.97,
"step": 7920
},
{
"epoch": 2.222839677688247,
"grad_norm": 0.5411983132362366,
"learning_rate": 0.0003259423914050199,
"loss": 0.104,
"step": 8000
},
{
"epoch": 2.361767157543762,
"grad_norm": 0.892245352268219,
"learning_rate": 0.00032132073724182645,
"loss": 0.1018,
"step": 8500
},
{
"epoch": 2.4006668519033063,
"eval_loss": 0.1253955215215683,
"eval_runtime": 15.9212,
"eval_samples_per_second": 31.405,
"eval_steps_per_second": 3.957,
"step": 8640
},
{
"epoch": 2.5006946373992776,
"grad_norm": 0.5154420137405396,
"learning_rate": 0.0003166898212466426,
"loss": 0.1018,
"step": 9000
},
{
"epoch": 2.600722422895249,
"eval_loss": 0.1270376443862915,
"eval_runtime": 15.8556,
"eval_samples_per_second": 31.535,
"eval_steps_per_second": 3.973,
"step": 9360
},
{
"epoch": 2.639622117254793,
"grad_norm": 0.4247698187828064,
"learning_rate": 0.0003120681670834491,
"loss": 0.0988,
"step": 9500
},
{
"epoch": 2.7785495971103087,
"grad_norm": 0.6174339652061462,
"learning_rate": 0.0003074372510882653,
"loss": 0.0931,
"step": 10000
},
{
"epoch": 2.800777993887191,
"eval_loss": 0.12492711842060089,
"eval_runtime": 15.9536,
"eval_samples_per_second": 31.341,
"eval_steps_per_second": 3.949,
"step": 10080
},
{
"epoch": 2.917477076965824,
"grad_norm": 0.3945905864238739,
"learning_rate": 0.0003028063350930814,
"loss": 0.0924,
"step": 10500
},
{
"epoch": 3.000833564879133,
"eval_loss": 0.12177152931690216,
"eval_runtime": 16.5123,
"eval_samples_per_second": 30.28,
"eval_steps_per_second": 3.815,
"step": 10800
},
{
"epoch": 3.0564045568213394,
"grad_norm": 0.4349508285522461,
"learning_rate": 0.0002981754190978976,
"loss": 0.0929,
"step": 11000
},
{
"epoch": 3.1953320366768545,
"grad_norm": 0.5195356011390686,
"learning_rate": 0.00029354450310271375,
"loss": 0.0897,
"step": 11500
},
{
"epoch": 3.2008891358710754,
"eval_loss": 0.12157219648361206,
"eval_runtime": 15.887,
"eval_samples_per_second": 31.472,
"eval_steps_per_second": 3.965,
"step": 11520
},
{
"epoch": 3.33425951653237,
"grad_norm": 0.38773760199546814,
"learning_rate": 0.0002889135871075299,
"loss": 0.0868,
"step": 12000
},
{
"epoch": 3.4009447068630174,
"eval_loss": 0.12406055629253387,
"eval_runtime": 15.9444,
"eval_samples_per_second": 31.359,
"eval_steps_per_second": 3.951,
"step": 12240
},
{
"epoch": 3.4731869963878856,
"grad_norm": 0.3054683804512024,
"learning_rate": 0.00028428267111234605,
"loss": 0.0865,
"step": 12500
},
{
"epoch": 3.6010002778549595,
"eval_loss": 0.11476034671068192,
"eval_runtime": 15.9006,
"eval_samples_per_second": 31.445,
"eval_steps_per_second": 3.962,
"step": 12960
},
{
"epoch": 3.612114476243401,
"grad_norm": 0.5311923623085022,
"learning_rate": 0.0002796610169491526,
"loss": 0.0845,
"step": 13000
},
{
"epoch": 3.7510419560989163,
"grad_norm": 0.7641647458076477,
"learning_rate": 0.0002750301009539687,
"loss": 0.084,
"step": 13500
},
{
"epoch": 3.801055848846902,
"eval_loss": 0.11587072908878326,
"eval_runtime": 15.933,
"eval_samples_per_second": 31.381,
"eval_steps_per_second": 3.954,
"step": 13680
},
{
"epoch": 3.889969435954432,
"grad_norm": 0.5842312574386597,
"learning_rate": 0.00027039918495878483,
"loss": 0.0815,
"step": 14000
},
{
"epoch": 4.001111419838844,
"eval_loss": 0.11761430650949478,
"eval_runtime": 16.0803,
"eval_samples_per_second": 31.094,
"eval_steps_per_second": 3.918,
"step": 14400
},
{
"epoch": 4.0288969158099475,
"grad_norm": 0.5182059407234192,
"learning_rate": 0.000265768268963601,
"loss": 0.0823,
"step": 14500
},
{
"epoch": 4.167824395665463,
"grad_norm": 0.3954576253890991,
"learning_rate": 0.0002611373529684172,
"loss": 0.0753,
"step": 15000
},
{
"epoch": 4.201166990830786,
"eval_loss": 0.11391445249319077,
"eval_runtime": 15.9483,
"eval_samples_per_second": 31.351,
"eval_steps_per_second": 3.95,
"step": 15120
},
{
"epoch": 4.306751875520978,
"grad_norm": 0.5974435210227966,
"learning_rate": 0.00025650643697323335,
"loss": 0.0762,
"step": 15500
},
{
"epoch": 4.4012225618227285,
"eval_loss": 0.11403658986091614,
"eval_runtime": 15.92,
"eval_samples_per_second": 31.407,
"eval_steps_per_second": 3.957,
"step": 15840
},
{
"epoch": 4.445679355376494,
"grad_norm": 0.4496535360813141,
"learning_rate": 0.0002518755209780495,
"loss": 0.0737,
"step": 16000
},
{
"epoch": 4.584606835232009,
"grad_norm": 0.5617558360099792,
"learning_rate": 0.0002472446049828656,
"loss": 0.074,
"step": 16500
},
{
"epoch": 4.601278132814671,
"eval_loss": 0.11306341737508774,
"eval_runtime": 15.9244,
"eval_samples_per_second": 31.398,
"eval_steps_per_second": 3.956,
"step": 16560
},
{
"epoch": 4.723534315087524,
"grad_norm": 0.5999208092689514,
"learning_rate": 0.00024261368898768177,
"loss": 0.0732,
"step": 17000
},
{
"epoch": 4.801333703806613,
"eval_loss": 0.11077062785625458,
"eval_runtime": 15.9311,
"eval_samples_per_second": 31.385,
"eval_steps_per_second": 3.955,
"step": 17280
},
{
"epoch": 4.86246179494304,
"grad_norm": 0.3961442708969116,
"learning_rate": 0.0002379920348244883,
"loss": 0.0724,
"step": 17500
}
],
"logging_steps": 500,
"max_steps": 43188,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.3736461282464236e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}