Commit 741b05c (verified): Training in progress, step 500, checkpoint
{
"best_metric": 0.8025246858596802,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.11486331265793706,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00022972662531587412,
"eval_loss": 1.124374270439148,
"eval_runtime": 51.9298,
"eval_samples_per_second": 35.298,
"eval_steps_per_second": 8.839,
"step": 1
},
{
"epoch": 0.002297266253158741,
"grad_norm": 0.9001293778419495,
"learning_rate": 4.12e-05,
"loss": 0.9218,
"step": 10
},
{
"epoch": 0.004594532506317482,
"grad_norm": 1.1467550992965698,
"learning_rate": 8.24e-05,
"loss": 0.9375,
"step": 20
},
{
"epoch": 0.006891798759476223,
"grad_norm": 1.1466718912124634,
"learning_rate": 0.0001236,
"loss": 0.9289,
"step": 30
},
{
"epoch": 0.009189065012634964,
"grad_norm": 1.5765711069107056,
"learning_rate": 0.0001648,
"loss": 0.8709,
"step": 40
},
{
"epoch": 0.011486331265793705,
"grad_norm": 1.228388786315918,
"learning_rate": 0.000206,
"loss": 0.925,
"step": 50
},
{
"epoch": 0.011486331265793705,
"eval_loss": 0.8846560120582581,
"eval_runtime": 52.2146,
"eval_samples_per_second": 35.105,
"eval_steps_per_second": 8.791,
"step": 50
},
{
"epoch": 0.013783597518952447,
"grad_norm": 0.6585016250610352,
"learning_rate": 0.0002057490971767619,
"loss": 0.7491,
"step": 60
},
{
"epoch": 0.01608086377211119,
"grad_norm": 0.8562206029891968,
"learning_rate": 0.00020499761108038175,
"loss": 0.8757,
"step": 70
},
{
"epoch": 0.018378130025269928,
"grad_norm": 0.7634038925170898,
"learning_rate": 0.00020374920287558198,
"loss": 0.9325,
"step": 80
},
{
"epoch": 0.02067539627842867,
"grad_norm": 0.8068410754203796,
"learning_rate": 0.00020200995468164684,
"loss": 0.842,
"step": 90
},
{
"epoch": 0.02297266253158741,
"grad_norm": 1.1460894346237183,
"learning_rate": 0.00019978833994094855,
"loss": 0.8731,
"step": 100
},
{
"epoch": 0.02297266253158741,
"eval_loss": 0.8611448407173157,
"eval_runtime": 51.6983,
"eval_samples_per_second": 35.456,
"eval_steps_per_second": 8.878,
"step": 100
},
{
"epoch": 0.02526992878474615,
"grad_norm": 0.5959282517433167,
"learning_rate": 0.00019709518213718787,
"loss": 0.7564,
"step": 110
},
{
"epoch": 0.027567195037904894,
"grad_norm": 0.7839557528495789,
"learning_rate": 0.00019394360206446948,
"loss": 0.8659,
"step": 120
},
{
"epoch": 0.029864461291063633,
"grad_norm": 0.8548005819320679,
"learning_rate": 0.00019034895390411186,
"loss": 0.8915,
"step": 130
},
{
"epoch": 0.03216172754422238,
"grad_norm": 0.7380218505859375,
"learning_rate": 0.0001863287504206196,
"loss": 0.8667,
"step": 140
},
{
"epoch": 0.03445899379738112,
"grad_norm": 1.0800361633300781,
"learning_rate": 0.00018190257764125471,
"loss": 0.848,
"step": 150
},
{
"epoch": 0.03445899379738112,
"eval_loss": 0.8501473665237427,
"eval_runtime": 51.9217,
"eval_samples_per_second": 35.303,
"eval_steps_per_second": 8.84,
"step": 150
},
{
"epoch": 0.036756260050539856,
"grad_norm": 0.5680871605873108,
"learning_rate": 0.00017709199943488106,
"loss": 0.7338,
"step": 160
},
{
"epoch": 0.0390535263036986,
"grad_norm": 0.6372123956680298,
"learning_rate": 0.00017192045245496238,
"loss": 0.8033,
"step": 170
},
{
"epoch": 0.04135079255685734,
"grad_norm": 0.8441762924194336,
"learning_rate": 0.00016641313195854277,
"loss": 0.9452,
"step": 180
},
{
"epoch": 0.04364805881001608,
"grad_norm": 0.7282130718231201,
"learning_rate": 0.0001605968690574869,
"loss": 0.8036,
"step": 190
},
{
"epoch": 0.04594532506317482,
"grad_norm": 1.0529526472091675,
"learning_rate": 0.0001545,
"loss": 0.8569,
"step": 200
},
{
"epoch": 0.04594532506317482,
"eval_loss": 0.8399880528450012,
"eval_runtime": 51.6878,
"eval_samples_per_second": 35.463,
"eval_steps_per_second": 8.88,
"step": 200
},
{
"epoch": 0.048242591316333565,
"grad_norm": 0.5307555794715881,
"learning_rate": 0.00014815222811927496,
"loss": 0.747,
"step": 210
},
{
"epoch": 0.0505398575694923,
"grad_norm": 0.7486038208007812,
"learning_rate": 0.00014158447912183896,
"loss": 0.8806,
"step": 220
},
{
"epoch": 0.052837123822651044,
"grad_norm": 0.875223696231842,
"learning_rate": 0.00013482875042061958,
"loss": 0.9061,
"step": 230
},
{
"epoch": 0.05513439007580979,
"grad_norm": 0.7279521226882935,
"learning_rate": 0.00012791795524676576,
"loss": 0.8071,
"step": 240
},
{
"epoch": 0.05743165632896853,
"grad_norm": 1.203727126121521,
"learning_rate": 0.00012088576229969385,
"loss": 0.8783,
"step": 250
},
{
"epoch": 0.05743165632896853,
"eval_loss": 0.8261856436729431,
"eval_runtime": 51.6149,
"eval_samples_per_second": 35.513,
"eval_steps_per_second": 8.893,
"step": 250
},
{
"epoch": 0.05972892258212727,
"grad_norm": 0.5175067782402039,
"learning_rate": 0.0001137664317165683,
"loss": 0.6935,
"step": 260
},
{
"epoch": 0.06202618883528601,
"grad_norm": 0.6327574849128723,
"learning_rate": 0.00010659464816035761,
"loss": 0.7742,
"step": 270
},
{
"epoch": 0.06432345508844475,
"grad_norm": 0.7658243775367737,
"learning_rate": 9.940535183964242e-05,
"loss": 0.8565,
"step": 280
},
{
"epoch": 0.06662072134160349,
"grad_norm": 0.7925461530685425,
"learning_rate": 9.22335682834317e-05,
"loss": 0.835,
"step": 290
},
{
"epoch": 0.06891798759476224,
"grad_norm": 0.9995759129524231,
"learning_rate": 8.511423770030617e-05,
"loss": 0.8554,
"step": 300
},
{
"epoch": 0.06891798759476224,
"eval_loss": 0.817738950252533,
"eval_runtime": 51.6421,
"eval_samples_per_second": 35.494,
"eval_steps_per_second": 8.888,
"step": 300
},
{
"epoch": 0.07121525384792098,
"grad_norm": 0.5488688945770264,
"learning_rate": 7.808204475323423e-05,
"loss": 0.692,
"step": 310
},
{
"epoch": 0.07351252010107971,
"grad_norm": 0.6316297054290771,
"learning_rate": 7.117124957938042e-05,
"loss": 0.8166,
"step": 320
},
{
"epoch": 0.07580978635423846,
"grad_norm": 0.6976966857910156,
"learning_rate": 6.441552087816105e-05,
"loss": 0.9222,
"step": 330
},
{
"epoch": 0.0781070526073972,
"grad_norm": 0.6356428265571594,
"learning_rate": 5.784777188072502e-05,
"loss": 0.852,
"step": 340
},
{
"epoch": 0.08040431886055593,
"grad_norm": 0.9787359833717346,
"learning_rate": 5.150000000000002e-05,
"loss": 0.8186,
"step": 350
},
{
"epoch": 0.08040431886055593,
"eval_loss": 0.8114416003227234,
"eval_runtime": 51.6228,
"eval_samples_per_second": 35.508,
"eval_steps_per_second": 8.891,
"step": 350
},
{
"epoch": 0.08270158511371468,
"grad_norm": 0.5780043601989746,
"learning_rate": 4.540313094251309e-05,
"loss": 0.6796,
"step": 360
},
{
"epoch": 0.08499885136687342,
"grad_norm": 0.7033482193946838,
"learning_rate": 3.958686804145719e-05,
"loss": 0.8087,
"step": 370
},
{
"epoch": 0.08729611762003216,
"grad_norm": 0.5974348783493042,
"learning_rate": 3.4079547545037634e-05,
"loss": 0.7923,
"step": 380
},
{
"epoch": 0.08959338387319091,
"grad_norm": 0.8100835084915161,
"learning_rate": 2.8908000565118947e-05,
"loss": 0.8478,
"step": 390
},
{
"epoch": 0.09189065012634964,
"grad_norm": 0.816436231136322,
"learning_rate": 2.4097422358745275e-05,
"loss": 0.8498,
"step": 400
},
{
"epoch": 0.09189065012634964,
"eval_loss": 0.8052087426185608,
"eval_runtime": 51.7512,
"eval_samples_per_second": 35.419,
"eval_steps_per_second": 8.869,
"step": 400
},
{
"epoch": 0.09418791637950838,
"grad_norm": 0.5542159080505371,
"learning_rate": 1.9671249579380422e-05,
"loss": 0.7486,
"step": 410
},
{
"epoch": 0.09648518263266713,
"grad_norm": 0.5713785886764526,
"learning_rate": 1.5651046095888127e-05,
"loss": 0.7955,
"step": 420
},
{
"epoch": 0.09878244888582587,
"grad_norm": 0.7074142098426819,
"learning_rate": 1.205639793553052e-05,
"loss": 0.8298,
"step": 430
},
{
"epoch": 0.1010797151389846,
"grad_norm": 0.6469036936759949,
"learning_rate": 8.904817862812098e-06,
"loss": 0.8189,
"step": 440
},
{
"epoch": 0.10337698139214335,
"grad_norm": 0.9961839914321899,
"learning_rate": 6.211660059051443e-06,
"loss": 0.816,
"step": 450
},
{
"epoch": 0.10337698139214335,
"eval_loss": 0.803685188293457,
"eval_runtime": 52.0877,
"eval_samples_per_second": 35.191,
"eval_steps_per_second": 8.812,
"step": 450
},
{
"epoch": 0.10567424764530209,
"grad_norm": 0.5176745057106018,
"learning_rate": 3.990045318353154e-06,
"loss": 0.7438,
"step": 460
},
{
"epoch": 0.10797151389846082,
"grad_norm": 0.5562933087348938,
"learning_rate": 2.250797124418014e-06,
"loss": 0.8519,
"step": 470
},
{
"epoch": 0.11026878015161957,
"grad_norm": 0.634845495223999,
"learning_rate": 1.0023889196182526e-06,
"loss": 0.8369,
"step": 480
},
{
"epoch": 0.11256604640477831,
"grad_norm": 0.7085196375846863,
"learning_rate": 2.5090282323810766e-07,
"loss": 0.8191,
"step": 490
},
{
"epoch": 0.11486331265793706,
"grad_norm": 1.0037809610366821,
"learning_rate": 0.0,
"loss": 0.8017,
"step": 500
},
{
"epoch": 0.11486331265793706,
"eval_loss": 0.8025246858596802,
"eval_runtime": 51.8757,
"eval_samples_per_second": 35.334,
"eval_steps_per_second": 8.848,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.39979133403136e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
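
The trainer state above can be inspected programmatically. The short sketch below is a minimal example, assuming the JSON is saved locally as trainer_state.json (an illustrative filename, not part of the checkpoint); it prints the evaluation loss recorded every 50 steps and the best checkpoint noted in best_model_checkpoint.

import json

# Minimal sketch: load this trainer state and list the logged eval losses.
# The filename "trainer_state.json" is an assumption for illustration.
with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    # Evaluation records carry "eval_loss"; training records carry "loss" instead.
    if "eval_loss" in entry:
        print(f"step {entry['step']:>3}: eval_loss = {entry['eval_loss']:.4f}")

print("best_metric:", state["best_metric"])
print("best_model_checkpoint:", state["best_model_checkpoint"])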