{
"best_metric": 0.39425286650657654,
"best_model_checkpoint": "mikhail-panzo/ceb_b128_le5_s4000/checkpoint-4000",
"epoch": 313.72549019607845,
"eval_steps": 500,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.9215686274509802,
"grad_norm": 4.645490646362305,
"learning_rate": 2.4500000000000004e-07,
"loss": 0.7929,
"step": 50
},
{
"epoch": 7.8431372549019605,
"grad_norm": 1.4075939655303955,
"learning_rate": 4.95e-07,
"loss": 0.7648,
"step": 100
},
{
"epoch": 11.764705882352942,
"grad_norm": 4.839516639709473,
"learning_rate": 7.450000000000001e-07,
"loss": 0.7575,
"step": 150
},
{
"epoch": 15.686274509803921,
"grad_norm": 1.0808578729629517,
"learning_rate": 9.950000000000002e-07,
"loss": 0.7118,
"step": 200
},
{
"epoch": 19.607843137254903,
"grad_norm": 2.4112303256988525,
"learning_rate": 1.2450000000000002e-06,
"loss": 0.706,
"step": 250
},
{
"epoch": 23.529411764705884,
"grad_norm": 0.9267112016677856,
"learning_rate": 1.495e-06,
"loss": 0.6756,
"step": 300
},
{
"epoch": 27.45098039215686,
"grad_norm": 1.3859704732894897,
"learning_rate": 1.745e-06,
"loss": 0.6412,
"step": 350
},
{
"epoch": 31.372549019607842,
"grad_norm": 0.9298150539398193,
"learning_rate": 1.9950000000000004e-06,
"loss": 0.5781,
"step": 400
},
{
"epoch": 35.294117647058826,
"grad_norm": 1.0941523313522339,
"learning_rate": 2.245e-06,
"loss": 0.5381,
"step": 450
},
{
"epoch": 39.21568627450981,
"grad_norm": 1.264251708984375,
"learning_rate": 2.4950000000000003e-06,
"loss": 0.5272,
"step": 500
},
{
"epoch": 39.21568627450981,
"eval_loss": 0.4584241807460785,
"eval_runtime": 6.3895,
"eval_samples_per_second": 28.171,
"eval_steps_per_second": 3.6,
"step": 500
},
{
"epoch": 43.13725490196079,
"grad_norm": 0.8968011736869812,
"learning_rate": 2.7450000000000004e-06,
"loss": 0.5121,
"step": 550
},
{
"epoch": 47.05882352941177,
"grad_norm": 1.1953343152999878,
"learning_rate": 2.995e-06,
"loss": 0.5047,
"step": 600
},
{
"epoch": 50.98039215686274,
"grad_norm": 0.8916338682174683,
"learning_rate": 3.2450000000000003e-06,
"loss": 0.5013,
"step": 650
},
{
"epoch": 54.90196078431372,
"grad_norm": 1.166737675666809,
"learning_rate": 3.495e-06,
"loss": 0.4847,
"step": 700
},
{
"epoch": 58.8235294117647,
"grad_norm": 0.9191999435424805,
"learning_rate": 3.745e-06,
"loss": 0.4849,
"step": 750
},
{
"epoch": 62.745098039215684,
"grad_norm": 1.0393210649490356,
"learning_rate": 3.995000000000001e-06,
"loss": 0.4827,
"step": 800
},
{
"epoch": 66.66666666666667,
"grad_norm": 0.764959454536438,
"learning_rate": 4.245e-06,
"loss": 0.483,
"step": 850
},
{
"epoch": 70.58823529411765,
"grad_norm": 1.0713616609573364,
"learning_rate": 4.495e-06,
"loss": 0.4762,
"step": 900
},
{
"epoch": 74.50980392156863,
"grad_norm": 0.8433477282524109,
"learning_rate": 4.745e-06,
"loss": 0.4702,
"step": 950
},
{
"epoch": 78.43137254901961,
"grad_norm": 0.6966288685798645,
"learning_rate": 4.9950000000000005e-06,
"loss": 0.4634,
"step": 1000
},
{
"epoch": 78.43137254901961,
"eval_loss": 0.42202475666999817,
"eval_runtime": 6.3604,
"eval_samples_per_second": 28.3,
"eval_steps_per_second": 3.616,
"step": 1000
},
{
"epoch": 82.3529411764706,
"grad_norm": 0.9115990400314331,
"learning_rate": 5.245e-06,
"loss": 0.47,
"step": 1050
},
{
"epoch": 86.27450980392157,
"grad_norm": 0.6754831075668335,
"learning_rate": 5.495000000000001e-06,
"loss": 0.4605,
"step": 1100
},
{
"epoch": 90.19607843137256,
"grad_norm": 1.0708327293395996,
"learning_rate": 5.745000000000001e-06,
"loss": 0.458,
"step": 1150
},
{
"epoch": 94.11764705882354,
"grad_norm": 0.7757265567779541,
"learning_rate": 5.995000000000001e-06,
"loss": 0.456,
"step": 1200
},
{
"epoch": 98.03921568627452,
"grad_norm": 1.1435647010803223,
"learning_rate": 6.245000000000001e-06,
"loss": 0.4576,
"step": 1250
},
{
"epoch": 101.96078431372548,
"grad_norm": 0.8143028020858765,
"learning_rate": 6.4950000000000005e-06,
"loss": 0.4518,
"step": 1300
},
{
"epoch": 105.88235294117646,
"grad_norm": 0.8940721750259399,
"learning_rate": 6.745000000000001e-06,
"loss": 0.4515,
"step": 1350
},
{
"epoch": 109.80392156862744,
"grad_norm": 1.8656580448150635,
"learning_rate": 6.995000000000001e-06,
"loss": 0.4516,
"step": 1400
},
{
"epoch": 113.72549019607843,
"grad_norm": 0.7817286252975464,
"learning_rate": 7.245000000000001e-06,
"loss": 0.4412,
"step": 1450
},
{
"epoch": 117.6470588235294,
"grad_norm": 1.806294322013855,
"learning_rate": 7.495000000000001e-06,
"loss": 0.4466,
"step": 1500
},
{
"epoch": 117.6470588235294,
"eval_loss": 0.41040292382240295,
"eval_runtime": 6.4965,
"eval_samples_per_second": 27.707,
"eval_steps_per_second": 3.54,
"step": 1500
},
{
"epoch": 121.56862745098039,
"grad_norm": 0.6831104755401611,
"learning_rate": 7.745e-06,
"loss": 0.4461,
"step": 1550
},
{
"epoch": 125.49019607843137,
"grad_norm": 1.195868968963623,
"learning_rate": 7.995e-06,
"loss": 0.4429,
"step": 1600
},
{
"epoch": 129.41176470588235,
"grad_norm": 1.1746853590011597,
"learning_rate": 8.245000000000002e-06,
"loss": 0.4358,
"step": 1650
},
{
"epoch": 133.33333333333334,
"grad_norm": 1.2797439098358154,
"learning_rate": 8.495e-06,
"loss": 0.4383,
"step": 1700
},
{
"epoch": 137.2549019607843,
"grad_norm": 0.6744837760925293,
"learning_rate": 8.745000000000002e-06,
"loss": 0.4416,
"step": 1750
},
{
"epoch": 141.1764705882353,
"grad_norm": 0.7655614018440247,
"learning_rate": 8.995000000000001e-06,
"loss": 0.4338,
"step": 1800
},
{
"epoch": 145.09803921568627,
"grad_norm": 0.9920282363891602,
"learning_rate": 9.245e-06,
"loss": 0.4337,
"step": 1850
},
{
"epoch": 149.01960784313727,
"grad_norm": 0.9740642309188843,
"learning_rate": 9.495000000000001e-06,
"loss": 0.4309,
"step": 1900
},
{
"epoch": 152.94117647058823,
"grad_norm": 0.9331285953521729,
"learning_rate": 9.745e-06,
"loss": 0.4337,
"step": 1950
},
{
"epoch": 156.86274509803923,
"grad_norm": 0.8512988686561584,
"learning_rate": 9.995000000000002e-06,
"loss": 0.4289,
"step": 2000
},
{
"epoch": 156.86274509803923,
"eval_loss": 0.4016592502593994,
"eval_runtime": 6.4392,
"eval_samples_per_second": 27.954,
"eval_steps_per_second": 3.572,
"step": 2000
},
{
"epoch": 160.7843137254902,
"grad_norm": 0.7746613025665283,
"learning_rate": 9.755e-06,
"loss": 0.4306,
"step": 2050
},
{
"epoch": 164.7058823529412,
"grad_norm": 0.6868831515312195,
"learning_rate": 9.505000000000001e-06,
"loss": 0.4302,
"step": 2100
},
{
"epoch": 168.62745098039215,
"grad_norm": 1.010834813117981,
"learning_rate": 9.255e-06,
"loss": 0.4254,
"step": 2150
},
{
"epoch": 172.54901960784315,
"grad_norm": 1.054592490196228,
"learning_rate": 9.005000000000001e-06,
"loss": 0.4248,
"step": 2200
},
{
"epoch": 176.47058823529412,
"grad_norm": 0.8121660351753235,
"learning_rate": 8.755e-06,
"loss": 0.4227,
"step": 2250
},
{
"epoch": 180.3921568627451,
"grad_norm": 0.6637047529220581,
"learning_rate": 8.505e-06,
"loss": 0.4232,
"step": 2300
},
{
"epoch": 184.31372549019608,
"grad_norm": 1.0822277069091797,
"learning_rate": 8.255000000000001e-06,
"loss": 0.4226,
"step": 2350
},
{
"epoch": 188.23529411764707,
"grad_norm": 0.759693443775177,
"learning_rate": 8.005e-06,
"loss": 0.4236,
"step": 2400
},
{
"epoch": 192.15686274509804,
"grad_norm": 0.576042652130127,
"learning_rate": 7.755000000000001e-06,
"loss": 0.4162,
"step": 2450
},
{
"epoch": 196.07843137254903,
"grad_norm": 0.8360034227371216,
"learning_rate": 7.505e-06,
"loss": 0.4223,
"step": 2500
},
{
"epoch": 196.07843137254903,
"eval_loss": 0.39692553877830505,
"eval_runtime": 6.4387,
"eval_samples_per_second": 27.956,
"eval_steps_per_second": 3.572,
"step": 2500
},
{
"epoch": 200.0,
"grad_norm": 0.7426376342773438,
"learning_rate": 7.255000000000001e-06,
"loss": 0.4157,
"step": 2550
},
{
"epoch": 203.92156862745097,
"grad_norm": 1.1800576448440552,
"learning_rate": 7.005000000000001e-06,
"loss": 0.419,
"step": 2600
},
{
"epoch": 207.84313725490196,
"grad_norm": 0.7355245351791382,
"learning_rate": 6.7550000000000005e-06,
"loss": 0.4174,
"step": 2650
},
{
"epoch": 211.76470588235293,
"grad_norm": 0.5805600881576538,
"learning_rate": 6.505e-06,
"loss": 0.4146,
"step": 2700
},
{
"epoch": 215.68627450980392,
"grad_norm": 0.9223101139068604,
"learning_rate": 6.255e-06,
"loss": 0.4178,
"step": 2750
},
{
"epoch": 219.6078431372549,
"grad_norm": 0.8155106902122498,
"learning_rate": 6.005000000000001e-06,
"loss": 0.4151,
"step": 2800
},
{
"epoch": 223.52941176470588,
"grad_norm": 0.6420881748199463,
"learning_rate": 5.755000000000001e-06,
"loss": 0.4156,
"step": 2850
},
{
"epoch": 227.45098039215685,
"grad_norm": 0.7704824209213257,
"learning_rate": 5.505000000000001e-06,
"loss": 0.4157,
"step": 2900
},
{
"epoch": 231.37254901960785,
"grad_norm": 0.6147534251213074,
"learning_rate": 5.2550000000000005e-06,
"loss": 0.4177,
"step": 2950
},
{
"epoch": 235.2941176470588,
"grad_norm": 0.758510172367096,
"learning_rate": 5.0049999999999995e-06,
"loss": 0.4149,
"step": 3000
},
{
"epoch": 235.2941176470588,
"eval_loss": 0.39595454931259155,
"eval_runtime": 6.3959,
"eval_samples_per_second": 28.143,
"eval_steps_per_second": 3.596,
"step": 3000
},
{
"epoch": 239.2156862745098,
"grad_norm": 0.7012345790863037,
"learning_rate": 4.755e-06,
"loss": 0.4103,
"step": 3050
},
{
"epoch": 243.13725490196077,
"grad_norm": 0.7060217261314392,
"learning_rate": 4.505e-06,
"loss": 0.4114,
"step": 3100
},
{
"epoch": 247.05882352941177,
"grad_norm": 0.8892201781272888,
"learning_rate": 4.255e-06,
"loss": 0.4093,
"step": 3150
},
{
"epoch": 250.98039215686273,
"grad_norm": 0.6098002791404724,
"learning_rate": 4.005000000000001e-06,
"loss": 0.411,
"step": 3200
},
{
"epoch": 254.90196078431373,
"grad_norm": 0.7227322459220886,
"learning_rate": 3.7550000000000005e-06,
"loss": 0.4108,
"step": 3250
},
{
"epoch": 258.8235294117647,
"grad_norm": 0.8260012865066528,
"learning_rate": 3.505e-06,
"loss": 0.412,
"step": 3300
},
{
"epoch": 262.7450980392157,
"grad_norm": 0.6897534132003784,
"learning_rate": 3.255e-06,
"loss": 0.4112,
"step": 3350
},
{
"epoch": 266.6666666666667,
"grad_norm": 1.0072269439697266,
"learning_rate": 3.005e-06,
"loss": 0.413,
"step": 3400
},
{
"epoch": 270.5882352941176,
"grad_norm": 0.6094086170196533,
"learning_rate": 2.7550000000000003e-06,
"loss": 0.4104,
"step": 3450
},
{
"epoch": 274.5098039215686,
"grad_norm": 0.6129186749458313,
"learning_rate": 2.505e-06,
"loss": 0.4129,
"step": 3500
},
{
"epoch": 274.5098039215686,
"eval_loss": 0.39622604846954346,
"eval_runtime": 6.4203,
"eval_samples_per_second": 28.036,
"eval_steps_per_second": 3.582,
"step": 3500
},
{
"epoch": 278.4313725490196,
"grad_norm": 0.6087095737457275,
"learning_rate": 2.2550000000000004e-06,
"loss": 0.4122,
"step": 3550
},
{
"epoch": 282.3529411764706,
"grad_norm": 0.5597049593925476,
"learning_rate": 2.0050000000000003e-06,
"loss": 0.4111,
"step": 3600
},
{
"epoch": 286.27450980392155,
"grad_norm": 0.5411703586578369,
"learning_rate": 1.7550000000000001e-06,
"loss": 0.4069,
"step": 3650
},
{
"epoch": 290.19607843137254,
"grad_norm": 0.6042822599411011,
"learning_rate": 1.505e-06,
"loss": 0.4079,
"step": 3700
},
{
"epoch": 294.11764705882354,
"grad_norm": 0.6852837204933167,
"learning_rate": 1.255e-06,
"loss": 0.4068,
"step": 3750
},
{
"epoch": 298.03921568627453,
"grad_norm": 0.6229822039604187,
"learning_rate": 1.0050000000000001e-06,
"loss": 0.4112,
"step": 3800
},
{
"epoch": 301.96078431372547,
"grad_norm": 0.6330438256263733,
"learning_rate": 7.550000000000001e-07,
"loss": 0.4053,
"step": 3850
},
{
"epoch": 305.88235294117646,
"grad_norm": 0.5041052103042603,
"learning_rate": 5.05e-07,
"loss": 0.405,
"step": 3900
},
{
"epoch": 309.80392156862746,
"grad_norm": 0.6818642020225525,
"learning_rate": 2.55e-07,
"loss": 0.4095,
"step": 3950
},
{
"epoch": 313.72549019607845,
"grad_norm": 0.4621037542819977,
"learning_rate": 5e-09,
"loss": 0.4108,
"step": 4000
},
{
"epoch": 313.72549019607845,
"eval_loss": 0.39425286650657654,
"eval_runtime": 6.437,
"eval_samples_per_second": 27.963,
"eval_steps_per_second": 3.573,
"step": 4000
}
],
"logging_steps": 50,
"max_steps": 4000,
"num_input_tokens_seen": 0,
"num_train_epochs": 334,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.70355911863679e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}