VERSIL91's picture
Training in progress, step 50, checkpoint
4491456 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.03251239535072747,
"eval_steps": 13,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006502479070145493,
"grad_norm": 0.16097666323184967,
"learning_rate": 1e-05,
"loss": 1.2708,
"step": 1
},
{
"epoch": 0.0006502479070145493,
"eval_loss": 1.3661339282989502,
"eval_runtime": 227.5771,
"eval_samples_per_second": 11.385,
"eval_steps_per_second": 5.695,
"step": 1
},
{
"epoch": 0.0013004958140290985,
"grad_norm": 0.19011889398097992,
"learning_rate": 2e-05,
"loss": 1.355,
"step": 2
},
{
"epoch": 0.001950743721043648,
"grad_norm": 0.16794496774673462,
"learning_rate": 3e-05,
"loss": 1.3159,
"step": 3
},
{
"epoch": 0.002600991628058197,
"grad_norm": 0.15826600790023804,
"learning_rate": 4e-05,
"loss": 1.327,
"step": 4
},
{
"epoch": 0.0032512395350727465,
"grad_norm": 0.1772988885641098,
"learning_rate": 5e-05,
"loss": 1.2694,
"step": 5
},
{
"epoch": 0.003901487442087296,
"grad_norm": 0.16691641509532928,
"learning_rate": 6e-05,
"loss": 1.2382,
"step": 6
},
{
"epoch": 0.004551735349101845,
"grad_norm": 0.17923057079315186,
"learning_rate": 7e-05,
"loss": 1.2678,
"step": 7
},
{
"epoch": 0.005201983256116394,
"grad_norm": 0.19407686591148376,
"learning_rate": 8e-05,
"loss": 1.2386,
"step": 8
},
{
"epoch": 0.005852231163130944,
"grad_norm": 0.21998471021652222,
"learning_rate": 9e-05,
"loss": 1.3607,
"step": 9
},
{
"epoch": 0.006502479070145493,
"grad_norm": 0.2332504242658615,
"learning_rate": 0.0001,
"loss": 1.3526,
"step": 10
},
{
"epoch": 0.007152726977160042,
"grad_norm": 0.20302407443523407,
"learning_rate": 9.98458666866564e-05,
"loss": 1.2525,
"step": 11
},
{
"epoch": 0.007802974884174592,
"grad_norm": 0.20793932676315308,
"learning_rate": 9.938441702975689e-05,
"loss": 1.3305,
"step": 12
},
{
"epoch": 0.00845322279118914,
"grad_norm": 0.24406825006008148,
"learning_rate": 9.861849601988383e-05,
"loss": 1.3575,
"step": 13
},
{
"epoch": 0.00845322279118914,
"eval_loss": 1.287429928779602,
"eval_runtime": 74.3569,
"eval_samples_per_second": 34.845,
"eval_steps_per_second": 17.429,
"step": 13
},
{
"epoch": 0.00910347069820369,
"grad_norm": 0.24397294223308563,
"learning_rate": 9.755282581475769e-05,
"loss": 1.2718,
"step": 14
},
{
"epoch": 0.00975371860521824,
"grad_norm": 0.2642868459224701,
"learning_rate": 9.619397662556435e-05,
"loss": 1.2672,
"step": 15
},
{
"epoch": 0.010403966512232788,
"grad_norm": 0.29374638199806213,
"learning_rate": 9.45503262094184e-05,
"loss": 1.2364,
"step": 16
},
{
"epoch": 0.011054214419247338,
"grad_norm": 0.2562926709651947,
"learning_rate": 9.263200821770461e-05,
"loss": 1.2337,
"step": 17
},
{
"epoch": 0.011704462326261888,
"grad_norm": 0.24123510718345642,
"learning_rate": 9.045084971874738e-05,
"loss": 1.2308,
"step": 18
},
{
"epoch": 0.012354710233276436,
"grad_norm": 0.298176109790802,
"learning_rate": 8.802029828000156e-05,
"loss": 1.269,
"step": 19
},
{
"epoch": 0.013004958140290986,
"grad_norm": 0.29537469148635864,
"learning_rate": 8.535533905932738e-05,
"loss": 1.2055,
"step": 20
},
{
"epoch": 0.013655206047305536,
"grad_norm": 0.2935850918292999,
"learning_rate": 8.247240241650918e-05,
"loss": 1.2187,
"step": 21
},
{
"epoch": 0.014305453954320084,
"grad_norm": 0.27675747871398926,
"learning_rate": 7.938926261462366e-05,
"loss": 1.2315,
"step": 22
},
{
"epoch": 0.014955701861334634,
"grad_norm": 0.2816616892814636,
"learning_rate": 7.612492823579745e-05,
"loss": 1.1898,
"step": 23
},
{
"epoch": 0.015605949768349184,
"grad_norm": 0.25230491161346436,
"learning_rate": 7.269952498697734e-05,
"loss": 1.2143,
"step": 24
},
{
"epoch": 0.016256197675363734,
"grad_norm": 0.31681883335113525,
"learning_rate": 6.91341716182545e-05,
"loss": 1.2153,
"step": 25
},
{
"epoch": 0.01690644558237828,
"grad_norm": 0.27878355979919434,
"learning_rate": 6.545084971874738e-05,
"loss": 1.1783,
"step": 26
},
{
"epoch": 0.01690644558237828,
"eval_loss": 1.2144542932510376,
"eval_runtime": 74.3264,
"eval_samples_per_second": 34.86,
"eval_steps_per_second": 17.437,
"step": 26
},
{
"epoch": 0.01755669348939283,
"grad_norm": 0.29930874705314636,
"learning_rate": 6.167226819279528e-05,
"loss": 1.2125,
"step": 27
},
{
"epoch": 0.01820694139640738,
"grad_norm": 0.27576595544815063,
"learning_rate": 5.782172325201155e-05,
"loss": 1.1358,
"step": 28
},
{
"epoch": 0.01885718930342193,
"grad_norm": 0.34936976432800293,
"learning_rate": 5.392295478639225e-05,
"loss": 1.2408,
"step": 29
},
{
"epoch": 0.01950743721043648,
"grad_norm": 0.32189854979515076,
"learning_rate": 5e-05,
"loss": 1.2407,
"step": 30
},
{
"epoch": 0.02015768511745103,
"grad_norm": 0.31609469652175903,
"learning_rate": 4.607704521360776e-05,
"loss": 1.213,
"step": 31
},
{
"epoch": 0.020807933024465576,
"grad_norm": 0.36792436242103577,
"learning_rate": 4.2178276747988446e-05,
"loss": 1.24,
"step": 32
},
{
"epoch": 0.021458180931480126,
"grad_norm": 0.293441504240036,
"learning_rate": 3.832773180720475e-05,
"loss": 1.1205,
"step": 33
},
{
"epoch": 0.022108428838494676,
"grad_norm": 0.357262521982193,
"learning_rate": 3.4549150281252636e-05,
"loss": 1.2352,
"step": 34
},
{
"epoch": 0.022758676745509226,
"grad_norm": 0.35729458928108215,
"learning_rate": 3.086582838174551e-05,
"loss": 1.0767,
"step": 35
},
{
"epoch": 0.023408924652523776,
"grad_norm": 0.3217020034790039,
"learning_rate": 2.7300475013022663e-05,
"loss": 1.2367,
"step": 36
},
{
"epoch": 0.024059172559538326,
"grad_norm": 0.3059484362602234,
"learning_rate": 2.3875071764202563e-05,
"loss": 1.1982,
"step": 37
},
{
"epoch": 0.024709420466552872,
"grad_norm": 0.3412172198295593,
"learning_rate": 2.061073738537635e-05,
"loss": 1.2419,
"step": 38
},
{
"epoch": 0.025359668373567422,
"grad_norm": 0.3185587227344513,
"learning_rate": 1.7527597583490822e-05,
"loss": 1.2128,
"step": 39
},
{
"epoch": 0.025359668373567422,
"eval_loss": 1.198966145515442,
"eval_runtime": 74.3589,
"eval_samples_per_second": 34.845,
"eval_steps_per_second": 17.429,
"step": 39
},
{
"epoch": 0.026009916280581972,
"grad_norm": 0.3025207817554474,
"learning_rate": 1.4644660940672627e-05,
"loss": 1.2036,
"step": 40
},
{
"epoch": 0.02666016418759652,
"grad_norm": 0.3241982161998749,
"learning_rate": 1.1979701719998453e-05,
"loss": 1.2358,
"step": 41
},
{
"epoch": 0.02731041209461107,
"grad_norm": 0.37644854187965393,
"learning_rate": 9.549150281252633e-06,
"loss": 1.2374,
"step": 42
},
{
"epoch": 0.027960660001625618,
"grad_norm": 0.32864895462989807,
"learning_rate": 7.367991782295391e-06,
"loss": 1.1828,
"step": 43
},
{
"epoch": 0.028610907908640168,
"grad_norm": 0.3057841956615448,
"learning_rate": 5.449673790581611e-06,
"loss": 1.0924,
"step": 44
},
{
"epoch": 0.029261155815654718,
"grad_norm": 0.30859678983688354,
"learning_rate": 3.8060233744356633e-06,
"loss": 1.2563,
"step": 45
},
{
"epoch": 0.029911403722669268,
"grad_norm": 0.2911403775215149,
"learning_rate": 2.4471741852423237e-06,
"loss": 1.1663,
"step": 46
},
{
"epoch": 0.030561651629683818,
"grad_norm": 0.2932097017765045,
"learning_rate": 1.3815039801161721e-06,
"loss": 1.2638,
"step": 47
},
{
"epoch": 0.031211899536698368,
"grad_norm": 0.32970038056373596,
"learning_rate": 6.15582970243117e-07,
"loss": 1.2205,
"step": 48
},
{
"epoch": 0.031862147443712914,
"grad_norm": 0.3031351566314697,
"learning_rate": 1.5413331334360182e-07,
"loss": 1.1235,
"step": 49
},
{
"epoch": 0.03251239535072747,
"grad_norm": 0.3202054500579834,
"learning_rate": 0.0,
"loss": 1.2213,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 50,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 13,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.696404144213197e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}