HiTruong's picture
Training in progress, epoch 14, checkpoint
0ecf281 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 14.989939637826962,
"eval_steps": 500,
"global_step": 3720,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.4024144869215292,
"grad_norm": 0.1414330005645752,
"learning_rate": 0.0002,
"loss": 0.8177,
"step": 100
},
{
"epoch": 0.8048289738430584,
"grad_norm": 0.1557430922985077,
"learning_rate": 0.0002,
"loss": 0.7179,
"step": 200
},
{
"epoch": 1.2072434607645874,
"grad_norm": 0.19957831501960754,
"learning_rate": 0.0002,
"loss": 0.676,
"step": 300
},
{
"epoch": 1.6096579476861166,
"grad_norm": 0.23390892148017883,
"learning_rate": 0.0002,
"loss": 0.6436,
"step": 400
},
{
"epoch": 2.0120724346076457,
"grad_norm": 0.22036917507648468,
"learning_rate": 0.0002,
"loss": 0.6293,
"step": 500
},
{
"epoch": 2.414486921529175,
"grad_norm": 0.23049889504909515,
"learning_rate": 0.0002,
"loss": 0.5812,
"step": 600
},
{
"epoch": 2.816901408450704,
"grad_norm": 0.2435583770275116,
"learning_rate": 0.0002,
"loss": 0.5789,
"step": 700
},
{
"epoch": 3.219315895372233,
"grad_norm": 0.2674865126609802,
"learning_rate": 0.0002,
"loss": 0.5455,
"step": 800
},
{
"epoch": 3.6217303822937623,
"grad_norm": 0.2517271339893341,
"learning_rate": 0.0002,
"loss": 0.5281,
"step": 900
},
{
"epoch": 4.0241448692152915,
"grad_norm": 0.27737826108932495,
"learning_rate": 0.0002,
"loss": 0.5117,
"step": 1000
},
{
"epoch": 4.426559356136821,
"grad_norm": 0.2797327935695648,
"learning_rate": 0.0002,
"loss": 0.4646,
"step": 1100
},
{
"epoch": 4.82897384305835,
"grad_norm": 0.2902151644229889,
"learning_rate": 0.0002,
"loss": 0.4705,
"step": 1200
},
{
"epoch": 5.241448692152917,
"grad_norm": 0.3148166239261627,
"learning_rate": 0.0002,
"loss": 0.4095,
"step": 1300
},
{
"epoch": 5.6438631790744465,
"grad_norm": 0.34497711062431335,
"learning_rate": 0.0002,
"loss": 0.4081,
"step": 1400
},
{
"epoch": 6.046277665995976,
"grad_norm": 0.3478667736053467,
"learning_rate": 0.0002,
"loss": 0.403,
"step": 1500
},
{
"epoch": 6.448692152917505,
"grad_norm": 0.3573280870914459,
"learning_rate": 0.0002,
"loss": 0.3532,
"step": 1600
},
{
"epoch": 6.851106639839034,
"grad_norm": 0.423519492149353,
"learning_rate": 0.0002,
"loss": 0.3552,
"step": 1700
},
{
"epoch": 7.253521126760563,
"grad_norm": 0.39140382409095764,
"learning_rate": 0.0002,
"loss": 0.3265,
"step": 1800
},
{
"epoch": 7.655935613682092,
"grad_norm": 0.42240509390830994,
"learning_rate": 0.0002,
"loss": 0.3078,
"step": 1900
},
{
"epoch": 8.058350100603622,
"grad_norm": 0.41395705938339233,
"learning_rate": 0.0002,
"loss": 0.301,
"step": 2000
},
{
"epoch": 8.460764587525151,
"grad_norm": 0.4344163239002228,
"learning_rate": 0.0002,
"loss": 0.2637,
"step": 2100
},
{
"epoch": 8.86317907444668,
"grad_norm": 0.4379712641239166,
"learning_rate": 0.0002,
"loss": 0.2684,
"step": 2200
},
{
"epoch": 9.26559356136821,
"grad_norm": 0.37764859199523926,
"learning_rate": 0.0002,
"loss": 0.2411,
"step": 2300
},
{
"epoch": 9.668008048289739,
"grad_norm": 0.4754630923271179,
"learning_rate": 0.0002,
"loss": 0.2334,
"step": 2400
},
{
"epoch": 10.080482897384305,
"grad_norm": 0.39889901876449585,
"learning_rate": 0.0002,
"loss": 0.1951,
"step": 2500
},
{
"epoch": 10.482897384305835,
"grad_norm": 0.4438158869743347,
"learning_rate": 0.0002,
"loss": 0.2041,
"step": 2600
},
{
"epoch": 10.885311871227364,
"grad_norm": 0.46953314542770386,
"learning_rate": 0.0002,
"loss": 0.2066,
"step": 2700
},
{
"epoch": 11.287726358148893,
"grad_norm": 0.4034820795059204,
"learning_rate": 0.0002,
"loss": 0.1857,
"step": 2800
},
{
"epoch": 11.690140845070422,
"grad_norm": 0.44169268012046814,
"learning_rate": 0.0002,
"loss": 0.1859,
"step": 2900
},
{
"epoch": 12.092555331991951,
"grad_norm": 0.3979734480381012,
"learning_rate": 0.0002,
"loss": 0.1799,
"step": 3000
},
{
"epoch": 12.49496981891348,
"grad_norm": 0.38474753499031067,
"learning_rate": 0.0002,
"loss": 0.165,
"step": 3100
},
{
"epoch": 12.89738430583501,
"grad_norm": 0.40796715021133423,
"learning_rate": 0.0002,
"loss": 0.1706,
"step": 3200
},
{
"epoch": 13.299798792756539,
"grad_norm": 0.42144036293029785,
"learning_rate": 0.0002,
"loss": 0.1563,
"step": 3300
},
{
"epoch": 13.702213279678068,
"grad_norm": 0.40393030643463135,
"learning_rate": 0.0002,
"loss": 0.1555,
"step": 3400
},
{
"epoch": 14.104627766599597,
"grad_norm": 0.3850514888763428,
"learning_rate": 0.0002,
"loss": 0.1537,
"step": 3500
},
{
"epoch": 14.507042253521126,
"grad_norm": 0.3872847855091095,
"learning_rate": 0.0002,
"loss": 0.1427,
"step": 3600
},
{
"epoch": 14.909456740442655,
"grad_norm": 0.3384864032268524,
"learning_rate": 0.0002,
"loss": 0.1471,
"step": 3700
}
],
"logging_steps": 100,
"max_steps": 3720,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.606624189428531e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}