lesso01's picture
Training in progress, step 500, checkpoint
f5e059a verified
{
"best_metric": 0.32265836000442505,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.437636761487965,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00087527352297593,
"eval_loss": 0.6020214557647705,
"eval_runtime": 73.5344,
"eval_samples_per_second": 6.541,
"eval_steps_per_second": 1.645,
"step": 1
},
{
"epoch": 0.0087527352297593,
"grad_norm": 3.310575246810913,
"learning_rate": 4.02e-05,
"loss": 2.1865,
"step": 10
},
{
"epoch": 0.0175054704595186,
"grad_norm": 2.0884060859680176,
"learning_rate": 8.04e-05,
"loss": 1.6063,
"step": 20
},
{
"epoch": 0.0262582056892779,
"grad_norm": 0.3308704197406769,
"learning_rate": 0.0001206,
"loss": 0.5758,
"step": 30
},
{
"epoch": 0.0350109409190372,
"grad_norm": 0.3193948566913605,
"learning_rate": 0.0001608,
"loss": 0.0245,
"step": 40
},
{
"epoch": 0.0437636761487965,
"grad_norm": 2.0569775104522705,
"learning_rate": 0.000201,
"loss": 0.0242,
"step": 50
},
{
"epoch": 0.0437636761487965,
"eval_loss": 0.4622001349925995,
"eval_runtime": 73.7255,
"eval_samples_per_second": 6.524,
"eval_steps_per_second": 1.641,
"step": 50
},
{
"epoch": 0.0525164113785558,
"grad_norm": 1.3228288888931274,
"learning_rate": 0.00020075518705111234,
"loss": 1.3474,
"step": 60
},
{
"epoch": 0.061269146608315096,
"grad_norm": 16.066726684570312,
"learning_rate": 0.00020002194090852784,
"loss": 1.1573,
"step": 70
},
{
"epoch": 0.0700218818380744,
"grad_norm": 0.06235679239034653,
"learning_rate": 0.00019880383387374748,
"loss": 0.7334,
"step": 80
},
{
"epoch": 0.0787746170678337,
"grad_norm": 2.231853485107422,
"learning_rate": 0.00019710680044180106,
"loss": 0.0922,
"step": 90
},
{
"epoch": 0.087527352297593,
"grad_norm": 1.0548862218856812,
"learning_rate": 0.0001949391083889838,
"loss": 0.0135,
"step": 100
},
{
"epoch": 0.087527352297593,
"eval_loss": 0.4962608814239502,
"eval_runtime": 74.2268,
"eval_samples_per_second": 6.48,
"eval_steps_per_second": 1.63,
"step": 100
},
{
"epoch": 0.0962800875273523,
"grad_norm": 1.9288173913955688,
"learning_rate": 0.00019231131849308138,
"loss": 1.7685,
"step": 110
},
{
"epoch": 0.1050328227571116,
"grad_norm": 1.6081706285476685,
"learning_rate": 0.00018923623308232218,
"loss": 1.2968,
"step": 120
},
{
"epoch": 0.1137855579868709,
"grad_norm": 0.19498944282531738,
"learning_rate": 0.00018572883366372081,
"loss": 0.503,
"step": 130
},
{
"epoch": 0.12253829321663019,
"grad_norm": 0.06771685183048248,
"learning_rate": 0.00018180620793468224,
"loss": 0.009,
"step": 140
},
{
"epoch": 0.13129102844638948,
"grad_norm": 0.04488571360707283,
"learning_rate": 0.00017748746653345728,
"loss": 0.003,
"step": 150
},
{
"epoch": 0.13129102844638948,
"eval_loss": 0.5679339170455933,
"eval_runtime": 73.9724,
"eval_samples_per_second": 6.502,
"eval_steps_per_second": 1.636,
"step": 150
},
{
"epoch": 0.1400437636761488,
"grad_norm": 2.001000165939331,
"learning_rate": 0.00017279364993403443,
"loss": 2.3431,
"step": 160
},
{
"epoch": 0.1487964989059081,
"grad_norm": 0.5801878571510315,
"learning_rate": 0.00016774762593906525,
"loss": 1.0523,
"step": 170
},
{
"epoch": 0.1575492341356674,
"grad_norm": 0.1135367825627327,
"learning_rate": 0.00016237397827022866,
"loss": 0.3065,
"step": 180
},
{
"epoch": 0.16630196936542668,
"grad_norm": 0.016945907846093178,
"learning_rate": 0.00015669888679881007,
"loss": 0.044,
"step": 190
},
{
"epoch": 0.175054704595186,
"grad_norm": 1.1491788625717163,
"learning_rate": 0.00015075,
"loss": 0.0059,
"step": 200
},
{
"epoch": 0.175054704595186,
"eval_loss": 0.4312651455402374,
"eval_runtime": 73.9325,
"eval_samples_per_second": 6.506,
"eval_steps_per_second": 1.637,
"step": 200
},
{
"epoch": 0.1838074398249453,
"grad_norm": 1.787918210029602,
"learning_rate": 0.00014455630025230227,
"loss": 1.7917,
"step": 210
},
{
"epoch": 0.1925601750547046,
"grad_norm": 0.8348940014839172,
"learning_rate": 0.00013814796263829918,
"loss": 1.4007,
"step": 220
},
{
"epoch": 0.2013129102844639,
"grad_norm": 0.09085320681333542,
"learning_rate": 0.00013155620793468223,
"loss": 0.5069,
"step": 230
},
{
"epoch": 0.2100656455142232,
"grad_norm": 0.22519822418689728,
"learning_rate": 0.0001248131505077666,
"loss": 0.0131,
"step": 240
},
{
"epoch": 0.2188183807439825,
"grad_norm": 1.1649621725082397,
"learning_rate": 0.00011795164185552652,
"loss": 0.0031,
"step": 250
},
{
"epoch": 0.2188183807439825,
"eval_loss": 0.4163644015789032,
"eval_runtime": 74.0082,
"eval_samples_per_second": 6.499,
"eval_steps_per_second": 1.635,
"step": 250
},
{
"epoch": 0.2275711159737418,
"grad_norm": 1.9786158800125122,
"learning_rate": 0.00011100511055839919,
"loss": 1.8299,
"step": 260
},
{
"epoch": 0.2363238512035011,
"grad_norm": 0.5342739820480347,
"learning_rate": 0.00010400739941860137,
"loss": 1.3171,
"step": 270
},
{
"epoch": 0.24507658643326038,
"grad_norm": 0.442454993724823,
"learning_rate": 9.699260058139868e-05,
"loss": 0.5873,
"step": 280
},
{
"epoch": 0.2538293216630197,
"grad_norm": 0.015526807866990566,
"learning_rate": 8.999488944160085e-05,
"loss": 0.0043,
"step": 290
},
{
"epoch": 0.26258205689277897,
"grad_norm": 0.0174541212618351,
"learning_rate": 8.30483581444735e-05,
"loss": 0.0057,
"step": 300
},
{
"epoch": 0.26258205689277897,
"eval_loss": 0.41985148191452026,
"eval_runtime": 74.0162,
"eval_samples_per_second": 6.499,
"eval_steps_per_second": 1.635,
"step": 300
},
{
"epoch": 0.2713347921225383,
"grad_norm": 1.2082922458648682,
"learning_rate": 7.618684949223341e-05,
"loss": 1.6062,
"step": 310
},
{
"epoch": 0.2800875273522976,
"grad_norm": 0.8829997181892395,
"learning_rate": 6.94437920653178e-05,
"loss": 1.1266,
"step": 320
},
{
"epoch": 0.2888402625820569,
"grad_norm": 0.010656801983714104,
"learning_rate": 6.285203736170084e-05,
"loss": 0.4209,
"step": 330
},
{
"epoch": 0.2975929978118162,
"grad_norm": 0.003175681456923485,
"learning_rate": 5.6443699747697714e-05,
"loss": 0.001,
"step": 340
},
{
"epoch": 0.3063457330415755,
"grad_norm": 0.427325040102005,
"learning_rate": 5.025000000000002e-05,
"loss": 0.0057,
"step": 350
},
{
"epoch": 0.3063457330415755,
"eval_loss": 0.35144680738449097,
"eval_runtime": 74.2859,
"eval_samples_per_second": 6.475,
"eval_steps_per_second": 1.629,
"step": 350
},
{
"epoch": 0.3150984682713348,
"grad_norm": 1.127820611000061,
"learning_rate": 4.430111320118996e-05,
"loss": 1.3987,
"step": 360
},
{
"epoch": 0.3238512035010941,
"grad_norm": 1.483729600906372,
"learning_rate": 3.862602172977134e-05,
"loss": 1.3456,
"step": 370
},
{
"epoch": 0.33260393873085337,
"grad_norm": 0.004763344768434763,
"learning_rate": 3.325237406093478e-05,
"loss": 0.4831,
"step": 380
},
{
"epoch": 0.3413566739606127,
"grad_norm": 0.0032259258441627026,
"learning_rate": 2.820635006596558e-05,
"loss": 0.003,
"step": 390
},
{
"epoch": 0.350109409190372,
"grad_norm": 0.011841998435556889,
"learning_rate": 2.351253346654272e-05,
"loss": 0.001,
"step": 400
},
{
"epoch": 0.350109409190372,
"eval_loss": 0.339497447013855,
"eval_runtime": 73.7088,
"eval_samples_per_second": 6.526,
"eval_steps_per_second": 1.642,
"step": 400
},
{
"epoch": 0.3588621444201313,
"grad_norm": 1.2920477390289307,
"learning_rate": 1.9193792065317794e-05,
"loss": 1.4396,
"step": 410
},
{
"epoch": 0.3676148796498906,
"grad_norm": 0.0026803743094205856,
"learning_rate": 1.5271166336279193e-05,
"loss": 1.1564,
"step": 420
},
{
"epoch": 0.37636761487964987,
"grad_norm": 0.006616776809096336,
"learning_rate": 1.1763766917677837e-05,
"loss": 0.2591,
"step": 430
},
{
"epoch": 0.3851203501094092,
"grad_norm": 0.02039826288819313,
"learning_rate": 8.688681506918602e-06,
"loss": 0.0429,
"step": 440
},
{
"epoch": 0.3938730853391685,
"grad_norm": 0.0405426099896431,
"learning_rate": 6.060891611016215e-06,
"loss": 0.0019,
"step": 450
},
{
"epoch": 0.3938730853391685,
"eval_loss": 0.32358914613723755,
"eval_runtime": 74.0818,
"eval_samples_per_second": 6.493,
"eval_steps_per_second": 1.633,
"step": 450
},
{
"epoch": 0.4026258205689278,
"grad_norm": 1.3911057710647583,
"learning_rate": 3.893199558198952e-06,
"loss": 1.3649,
"step": 460
},
{
"epoch": 0.4113785557986871,
"grad_norm": 0.56399005651474,
"learning_rate": 2.1961661262525285e-06,
"loss": 1.1702,
"step": 470
},
{
"epoch": 0.4201312910284464,
"grad_norm": 0.009245248511433601,
"learning_rate": 9.780590914721787e-07,
"loss": 0.1987,
"step": 480
},
{
"epoch": 0.4288840262582057,
"grad_norm": 0.026554109528660774,
"learning_rate": 2.4481294888766817e-07,
"loss": 0.0022,
"step": 490
},
{
"epoch": 0.437636761487965,
"grad_norm": 0.18384258449077606,
"learning_rate": 0.0,
"loss": 0.002,
"step": 500
},
{
"epoch": 0.437636761487965,
"eval_loss": 0.32265836000442505,
"eval_runtime": 74.0617,
"eval_samples_per_second": 6.495,
"eval_steps_per_second": 1.634,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7024096573259776e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}