Toshiiiii1's picture
Upload 11 files
a4e2dc1 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.995418771290967,
"eval_steps": 500,
"global_step": 25500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06,
"learning_rate": 5.873140172278778e-05,
"loss": 0.0966,
"step": 500
},
{
"epoch": 0.06,
"eval_loss": 0.03481524437665939,
"eval_runtime": 123.2077,
"eval_samples_per_second": 53.097,
"eval_steps_per_second": 6.639,
"step": 500
},
{
"epoch": 0.12,
"learning_rate": 0.00011746280344557555,
"loss": 0.0446,
"step": 1000
},
{
"epoch": 0.12,
"eval_loss": 0.038774896413087845,
"eval_runtime": 123.3038,
"eval_samples_per_second": 53.056,
"eval_steps_per_second": 6.634,
"step": 1000
},
{
"epoch": 0.18,
"learning_rate": 0.00017619420516836332,
"loss": 0.063,
"step": 1500
},
{
"epoch": 0.18,
"eval_loss": 0.06299161165952682,
"eval_runtime": 123.374,
"eval_samples_per_second": 53.026,
"eval_steps_per_second": 6.63,
"step": 1500
},
{
"epoch": 0.23,
"learning_rate": 0.0002349256068911511,
"loss": 0.0938,
"step": 2000
},
{
"epoch": 0.23,
"eval_loss": 0.07528574019670486,
"eval_runtime": 123.3537,
"eval_samples_per_second": 53.034,
"eval_steps_per_second": 6.631,
"step": 2000
},
{
"epoch": 0.29,
"learning_rate": 0.0002936570086139389,
"loss": 0.1281,
"step": 2500
},
{
"epoch": 0.29,
"eval_loss": 0.10252855718135834,
"eval_runtime": 123.3622,
"eval_samples_per_second": 53.031,
"eval_steps_per_second": 6.631,
"step": 2500
},
{
"epoch": 0.35,
"learning_rate": 0.00029417881226887095,
"loss": 0.1452,
"step": 3000
},
{
"epoch": 0.35,
"eval_loss": 0.12126505374908447,
"eval_runtime": 123.2879,
"eval_samples_per_second": 53.063,
"eval_steps_per_second": 6.635,
"step": 3000
},
{
"epoch": 0.41,
"learning_rate": 0.0002876528170546008,
"loss": 0.1393,
"step": 3500
},
{
"epoch": 0.41,
"eval_loss": 0.0877891555428505,
"eval_runtime": 123.3134,
"eval_samples_per_second": 53.052,
"eval_steps_per_second": 6.634,
"step": 3500
},
{
"epoch": 0.47,
"learning_rate": 0.00028112682184033063,
"loss": 0.1278,
"step": 4000
},
{
"epoch": 0.47,
"eval_loss": 0.08643390238285065,
"eval_runtime": 123.5525,
"eval_samples_per_second": 52.949,
"eval_steps_per_second": 6.621,
"step": 4000
},
{
"epoch": 0.53,
"learning_rate": 0.00027460082662606047,
"loss": 0.121,
"step": 4500
},
{
"epoch": 0.53,
"eval_loss": 0.07961534708738327,
"eval_runtime": 123.4172,
"eval_samples_per_second": 53.007,
"eval_steps_per_second": 6.628,
"step": 4500
},
{
"epoch": 0.59,
"learning_rate": 0.00026807483141179025,
"loss": 0.115,
"step": 5000
},
{
"epoch": 0.59,
"eval_loss": 0.07281593233346939,
"eval_runtime": 123.3851,
"eval_samples_per_second": 53.021,
"eval_steps_per_second": 6.63,
"step": 5000
},
{
"epoch": 0.65,
"learning_rate": 0.0002615488361975201,
"loss": 0.1105,
"step": 5500
},
{
"epoch": 0.65,
"eval_loss": 0.06985253840684891,
"eval_runtime": 123.3894,
"eval_samples_per_second": 53.019,
"eval_steps_per_second": 6.629,
"step": 5500
},
{
"epoch": 0.7,
"learning_rate": 0.00025502284098324993,
"loss": 0.109,
"step": 6000
},
{
"epoch": 0.7,
"eval_loss": 0.0665203332901001,
"eval_runtime": 123.2402,
"eval_samples_per_second": 53.083,
"eval_steps_per_second": 6.637,
"step": 6000
},
{
"epoch": 0.76,
"learning_rate": 0.00024849684576897977,
"loss": 0.0989,
"step": 6500
},
{
"epoch": 0.76,
"eval_loss": 0.06210066005587578,
"eval_runtime": 123.1909,
"eval_samples_per_second": 53.105,
"eval_steps_per_second": 6.64,
"step": 6500
},
{
"epoch": 0.82,
"learning_rate": 0.00024197085055470958,
"loss": 0.0981,
"step": 7000
},
{
"epoch": 0.82,
"eval_loss": 0.06805345416069031,
"eval_runtime": 123.2672,
"eval_samples_per_second": 53.072,
"eval_steps_per_second": 6.636,
"step": 7000
},
{
"epoch": 0.88,
"learning_rate": 0.0002354448553404394,
"loss": 0.0913,
"step": 7500
},
{
"epoch": 0.88,
"eval_loss": 0.05965917557477951,
"eval_runtime": 123.3122,
"eval_samples_per_second": 53.052,
"eval_steps_per_second": 6.634,
"step": 7500
},
{
"epoch": 0.94,
"learning_rate": 0.00022891886012616923,
"loss": 0.0878,
"step": 8000
},
{
"epoch": 0.94,
"eval_loss": 0.05600811913609505,
"eval_runtime": 123.3189,
"eval_samples_per_second": 53.049,
"eval_steps_per_second": 6.633,
"step": 8000
},
{
"epoch": 1.0,
"learning_rate": 0.00022239286491189904,
"loss": 0.0852,
"step": 8500
},
{
"epoch": 1.0,
"eval_loss": 0.05495968833565712,
"eval_runtime": 123.3222,
"eval_samples_per_second": 53.048,
"eval_steps_per_second": 6.633,
"step": 8500
},
{
"epoch": 1.06,
"learning_rate": 0.00021586686969762888,
"loss": 0.0708,
"step": 9000
},
{
"epoch": 1.06,
"eval_loss": 0.05065647512674332,
"eval_runtime": 123.1018,
"eval_samples_per_second": 53.143,
"eval_steps_per_second": 6.645,
"step": 9000
},
{
"epoch": 1.12,
"learning_rate": 0.0002093408744833587,
"loss": 0.0669,
"step": 9500
},
{
"epoch": 1.12,
"eval_loss": 0.05243635177612305,
"eval_runtime": 123.1404,
"eval_samples_per_second": 53.126,
"eval_steps_per_second": 6.643,
"step": 9500
},
{
"epoch": 1.17,
"learning_rate": 0.00020281487926908853,
"loss": 0.0657,
"step": 10000
},
{
"epoch": 1.17,
"eval_loss": 0.04499243199825287,
"eval_runtime": 123.3227,
"eval_samples_per_second": 53.048,
"eval_steps_per_second": 6.633,
"step": 10000
},
{
"epoch": 1.23,
"learning_rate": 0.00019628888405481834,
"loss": 0.0636,
"step": 10500
},
{
"epoch": 1.23,
"eval_loss": 0.04722798988223076,
"eval_runtime": 123.4099,
"eval_samples_per_second": 53.01,
"eval_steps_per_second": 6.628,
"step": 10500
},
{
"epoch": 1.29,
"learning_rate": 0.00018976288884054818,
"loss": 0.0623,
"step": 11000
},
{
"epoch": 1.29,
"eval_loss": 0.042462971061468124,
"eval_runtime": 123.343,
"eval_samples_per_second": 53.039,
"eval_steps_per_second": 6.632,
"step": 11000
},
{
"epoch": 1.35,
"learning_rate": 0.00018323689362627802,
"loss": 0.0596,
"step": 11500
},
{
"epoch": 1.35,
"eval_loss": 0.04393870010972023,
"eval_runtime": 123.1794,
"eval_samples_per_second": 53.11,
"eval_steps_per_second": 6.641,
"step": 11500
},
{
"epoch": 1.41,
"learning_rate": 0.00017671089841200783,
"loss": 0.0612,
"step": 12000
},
{
"epoch": 1.41,
"eval_loss": 0.03987164422869682,
"eval_runtime": 123.4328,
"eval_samples_per_second": 53.001,
"eval_steps_per_second": 6.627,
"step": 12000
},
{
"epoch": 1.47,
"learning_rate": 0.00017018490319773767,
"loss": 0.0553,
"step": 12500
},
{
"epoch": 1.47,
"eval_loss": 0.04043687880039215,
"eval_runtime": 123.5215,
"eval_samples_per_second": 52.962,
"eval_steps_per_second": 6.622,
"step": 12500
},
{
"epoch": 1.53,
"learning_rate": 0.00016365890798346748,
"loss": 0.0565,
"step": 13000
},
{
"epoch": 1.53,
"eval_loss": 0.04023285582661629,
"eval_runtime": 123.3607,
"eval_samples_per_second": 53.031,
"eval_steps_per_second": 6.631,
"step": 13000
},
{
"epoch": 1.59,
"learning_rate": 0.00015713291276919726,
"loss": 0.0541,
"step": 13500
},
{
"epoch": 1.59,
"eval_loss": 0.03617456555366516,
"eval_runtime": 123.7139,
"eval_samples_per_second": 52.88,
"eval_steps_per_second": 6.612,
"step": 13500
},
{
"epoch": 1.64,
"learning_rate": 0.0001506069175549271,
"loss": 0.0527,
"step": 14000
},
{
"epoch": 1.64,
"eval_loss": 0.036614831537008286,
"eval_runtime": 123.4987,
"eval_samples_per_second": 52.972,
"eval_steps_per_second": 6.624,
"step": 14000
},
{
"epoch": 1.7,
"learning_rate": 0.00014408092234065694,
"loss": 0.0485,
"step": 14500
},
{
"epoch": 1.7,
"eval_loss": 0.03705143555998802,
"eval_runtime": 123.4413,
"eval_samples_per_second": 52.997,
"eval_steps_per_second": 6.627,
"step": 14500
},
{
"epoch": 1.76,
"learning_rate": 0.00013755492712638678,
"loss": 0.0502,
"step": 15000
},
{
"epoch": 1.76,
"eval_loss": 0.03249647840857506,
"eval_runtime": 123.5764,
"eval_samples_per_second": 52.939,
"eval_steps_per_second": 6.619,
"step": 15000
},
{
"epoch": 1.82,
"learning_rate": 0.0001310289319121166,
"loss": 0.0485,
"step": 15500
},
{
"epoch": 1.82,
"eval_loss": 0.03329641371965408,
"eval_runtime": 123.7232,
"eval_samples_per_second": 52.876,
"eval_steps_per_second": 6.612,
"step": 15500
},
{
"epoch": 1.88,
"learning_rate": 0.00012450293669784643,
"loss": 0.0459,
"step": 16000
},
{
"epoch": 1.88,
"eval_loss": 0.03429277986288071,
"eval_runtime": 123.8157,
"eval_samples_per_second": 52.837,
"eval_steps_per_second": 6.607,
"step": 16000
},
{
"epoch": 1.94,
"learning_rate": 0.00011797694148357624,
"loss": 0.0461,
"step": 16500
},
{
"epoch": 1.94,
"eval_loss": 0.02981030009686947,
"eval_runtime": 123.4645,
"eval_samples_per_second": 52.987,
"eval_steps_per_second": 6.625,
"step": 16500
},
{
"epoch": 2.0,
"learning_rate": 0.00011145094626930605,
"loss": 0.0423,
"step": 17000
},
{
"epoch": 2.0,
"eval_loss": 0.029031969606876373,
"eval_runtime": 123.8223,
"eval_samples_per_second": 52.834,
"eval_steps_per_second": 6.606,
"step": 17000
},
{
"epoch": 2.06,
"learning_rate": 0.00010492495105503587,
"loss": 0.0304,
"step": 17500
},
{
"epoch": 2.06,
"eval_loss": 0.029610687866806984,
"eval_runtime": 123.3939,
"eval_samples_per_second": 53.017,
"eval_steps_per_second": 6.629,
"step": 17500
},
{
"epoch": 2.11,
"learning_rate": 9.83989558407657e-05,
"loss": 0.0286,
"step": 18000
},
{
"epoch": 2.11,
"eval_loss": 0.029453950002789497,
"eval_runtime": 123.7963,
"eval_samples_per_second": 52.845,
"eval_steps_per_second": 6.608,
"step": 18000
},
{
"epoch": 2.17,
"learning_rate": 9.187296062649552e-05,
"loss": 0.0294,
"step": 18500
},
{
"epoch": 2.17,
"eval_loss": 0.028666863217949867,
"eval_runtime": 123.5343,
"eval_samples_per_second": 52.957,
"eval_steps_per_second": 6.622,
"step": 18500
},
{
"epoch": 2.23,
"learning_rate": 8.534696541222536e-05,
"loss": 0.0271,
"step": 19000
},
{
"epoch": 2.23,
"eval_loss": 0.029929010197520256,
"eval_runtime": 123.4149,
"eval_samples_per_second": 53.008,
"eval_steps_per_second": 6.628,
"step": 19000
},
{
"epoch": 2.29,
"learning_rate": 7.882097019795519e-05,
"loss": 0.0256,
"step": 19500
},
{
"epoch": 2.29,
"eval_loss": 0.027137087658047676,
"eval_runtime": 123.7129,
"eval_samples_per_second": 52.88,
"eval_steps_per_second": 6.612,
"step": 19500
},
{
"epoch": 2.35,
"learning_rate": 7.229497498368501e-05,
"loss": 0.0256,
"step": 20000
},
{
"epoch": 2.35,
"eval_loss": 0.027056274935603142,
"eval_runtime": 123.4616,
"eval_samples_per_second": 52.988,
"eval_steps_per_second": 6.626,
"step": 20000
},
{
"epoch": 2.41,
"learning_rate": 6.576897976941483e-05,
"loss": 0.0231,
"step": 20500
},
{
"epoch": 2.41,
"eval_loss": 0.02597939409315586,
"eval_runtime": 123.5327,
"eval_samples_per_second": 52.958,
"eval_steps_per_second": 6.622,
"step": 20500
},
{
"epoch": 2.47,
"learning_rate": 5.924298455514466e-05,
"loss": 0.0246,
"step": 21000
},
{
"epoch": 2.47,
"eval_loss": 0.025122959166765213,
"eval_runtime": 123.806,
"eval_samples_per_second": 52.841,
"eval_steps_per_second": 6.607,
"step": 21000
},
{
"epoch": 2.53,
"learning_rate": 5.2716989340874485e-05,
"loss": 0.0235,
"step": 21500
},
{
"epoch": 2.53,
"eval_loss": 0.024449240416288376,
"eval_runtime": 123.6218,
"eval_samples_per_second": 52.919,
"eval_steps_per_second": 6.617,
"step": 21500
},
{
"epoch": 2.58,
"learning_rate": 4.61909941266043e-05,
"loss": 0.0234,
"step": 22000
},
{
"epoch": 2.58,
"eval_loss": 0.024168582633137703,
"eval_runtime": 123.8732,
"eval_samples_per_second": 52.812,
"eval_steps_per_second": 6.604,
"step": 22000
},
{
"epoch": 2.64,
"learning_rate": 3.966499891233413e-05,
"loss": 0.0224,
"step": 22500
},
{
"epoch": 2.64,
"eval_loss": 0.023941034451127052,
"eval_runtime": 123.519,
"eval_samples_per_second": 52.963,
"eval_steps_per_second": 6.622,
"step": 22500
},
{
"epoch": 2.7,
"learning_rate": 3.313900369806395e-05,
"loss": 0.0214,
"step": 23000
},
{
"epoch": 2.7,
"eval_loss": 0.02283557504415512,
"eval_runtime": 123.5762,
"eval_samples_per_second": 52.939,
"eval_steps_per_second": 6.619,
"step": 23000
},
{
"epoch": 2.76,
"learning_rate": 2.6613008483793777e-05,
"loss": 0.0213,
"step": 23500
},
{
"epoch": 2.76,
"eval_loss": 0.0223745945841074,
"eval_runtime": 123.8829,
"eval_samples_per_second": 52.808,
"eval_steps_per_second": 6.603,
"step": 23500
},
{
"epoch": 2.82,
"learning_rate": 2.0087013269523602e-05,
"loss": 0.0196,
"step": 24000
},
{
"epoch": 2.82,
"eval_loss": 0.02261008322238922,
"eval_runtime": 123.6333,
"eval_samples_per_second": 52.915,
"eval_steps_per_second": 6.616,
"step": 24000
},
{
"epoch": 2.88,
"learning_rate": 1.3561018055253423e-05,
"loss": 0.0194,
"step": 24500
},
{
"epoch": 2.88,
"eval_loss": 0.021464873105287552,
"eval_runtime": 124.0332,
"eval_samples_per_second": 52.744,
"eval_steps_per_second": 6.595,
"step": 24500
},
{
"epoch": 2.94,
"learning_rate": 7.03502284098325e-06,
"loss": 0.0191,
"step": 25000
},
{
"epoch": 2.94,
"eval_loss": 0.021281694993376732,
"eval_runtime": 123.6857,
"eval_samples_per_second": 52.892,
"eval_steps_per_second": 6.614,
"step": 25000
},
{
"epoch": 3.0,
"learning_rate": 5.090276267130737e-07,
"loss": 0.0197,
"step": 25500
},
{
"epoch": 3.0,
"eval_loss": 0.0211211945861578,
"eval_runtime": 123.7008,
"eval_samples_per_second": 52.886,
"eval_steps_per_second": 6.613,
"step": 25500
}
],
"logging_steps": 500,
"max_steps": 25539,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 2.428545951977472e+16,
"train_batch_size": 18,
"trial_name": null,
"trial_params": null
}