|
{ |
|
"best_metric": 0.39425286650657654, |
|
"best_model_checkpoint": "mikhail-panzo/ceb_b128_le5_s4000/checkpoint-4000", |
|
"epoch": 313.72549019607845, |
|
"eval_steps": 500, |
|
"global_step": 4000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.9215686274509802, |
|
"grad_norm": 4.645490646362305, |
|
"learning_rate": 2.4500000000000004e-07, |
|
"loss": 0.7929, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 7.8431372549019605, |
|
"grad_norm": 1.4075939655303955, |
|
"learning_rate": 4.95e-07, |
|
"loss": 0.7648, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 11.764705882352942, |
|
"grad_norm": 4.839516639709473, |
|
"learning_rate": 7.450000000000001e-07, |
|
"loss": 0.7575, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 15.686274509803921, |
|
"grad_norm": 1.0808578729629517, |
|
"learning_rate": 9.950000000000002e-07, |
|
"loss": 0.7118, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 19.607843137254903, |
|
"grad_norm": 2.4112303256988525, |
|
"learning_rate": 1.2450000000000002e-06, |
|
"loss": 0.706, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 23.529411764705884, |
|
"grad_norm": 0.9267112016677856, |
|
"learning_rate": 1.495e-06, |
|
"loss": 0.6756, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 27.45098039215686, |
|
"grad_norm": 1.3859704732894897, |
|
"learning_rate": 1.745e-06, |
|
"loss": 0.6412, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 31.372549019607842, |
|
"grad_norm": 0.9298150539398193, |
|
"learning_rate": 1.9950000000000004e-06, |
|
"loss": 0.5781, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 35.294117647058826, |
|
"grad_norm": 1.0941523313522339, |
|
"learning_rate": 2.245e-06, |
|
"loss": 0.5381, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 39.21568627450981, |
|
"grad_norm": 1.264251708984375, |
|
"learning_rate": 2.4950000000000003e-06, |
|
"loss": 0.5272, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 39.21568627450981, |
|
"eval_loss": 0.4584241807460785, |
|
"eval_runtime": 6.3895, |
|
"eval_samples_per_second": 28.171, |
|
"eval_steps_per_second": 3.6, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 43.13725490196079, |
|
"grad_norm": 0.8968011736869812, |
|
"learning_rate": 2.7450000000000004e-06, |
|
"loss": 0.5121, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 47.05882352941177, |
|
"grad_norm": 1.1953343152999878, |
|
"learning_rate": 2.995e-06, |
|
"loss": 0.5047, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 50.98039215686274, |
|
"grad_norm": 0.8916338682174683, |
|
"learning_rate": 3.2450000000000003e-06, |
|
"loss": 0.5013, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 54.90196078431372, |
|
"grad_norm": 1.166737675666809, |
|
"learning_rate": 3.495e-06, |
|
"loss": 0.4847, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 58.8235294117647, |
|
"grad_norm": 0.9191999435424805, |
|
"learning_rate": 3.745e-06, |
|
"loss": 0.4849, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 62.745098039215684, |
|
"grad_norm": 1.0393210649490356, |
|
"learning_rate": 3.995000000000001e-06, |
|
"loss": 0.4827, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 0.764959454536438, |
|
"learning_rate": 4.245e-06, |
|
"loss": 0.483, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 70.58823529411765, |
|
"grad_norm": 1.0713616609573364, |
|
"learning_rate": 4.495e-06, |
|
"loss": 0.4762, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 74.50980392156863, |
|
"grad_norm": 0.8433477282524109, |
|
"learning_rate": 4.745e-06, |
|
"loss": 0.4702, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 78.43137254901961, |
|
"grad_norm": 0.6966288685798645, |
|
"learning_rate": 4.9950000000000005e-06, |
|
"loss": 0.4634, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 78.43137254901961, |
|
"eval_loss": 0.42202475666999817, |
|
"eval_runtime": 6.3604, |
|
"eval_samples_per_second": 28.3, |
|
"eval_steps_per_second": 3.616, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 82.3529411764706, |
|
"grad_norm": 0.9115990400314331, |
|
"learning_rate": 5.245e-06, |
|
"loss": 0.47, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 86.27450980392157, |
|
"grad_norm": 0.6754831075668335, |
|
"learning_rate": 5.495000000000001e-06, |
|
"loss": 0.4605, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 90.19607843137256, |
|
"grad_norm": 1.0708327293395996, |
|
"learning_rate": 5.745000000000001e-06, |
|
"loss": 0.458, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 94.11764705882354, |
|
"grad_norm": 0.7757265567779541, |
|
"learning_rate": 5.995000000000001e-06, |
|
"loss": 0.456, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 98.03921568627452, |
|
"grad_norm": 1.1435647010803223, |
|
"learning_rate": 6.245000000000001e-06, |
|
"loss": 0.4576, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 101.96078431372548, |
|
"grad_norm": 0.8143028020858765, |
|
"learning_rate": 6.4950000000000005e-06, |
|
"loss": 0.4518, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 105.88235294117646, |
|
"grad_norm": 0.8940721750259399, |
|
"learning_rate": 6.745000000000001e-06, |
|
"loss": 0.4515, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 109.80392156862744, |
|
"grad_norm": 1.8656580448150635, |
|
"learning_rate": 6.995000000000001e-06, |
|
"loss": 0.4516, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 113.72549019607843, |
|
"grad_norm": 0.7817286252975464, |
|
"learning_rate": 7.245000000000001e-06, |
|
"loss": 0.4412, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 117.6470588235294, |
|
"grad_norm": 1.806294322013855, |
|
"learning_rate": 7.495000000000001e-06, |
|
"loss": 0.4466, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 117.6470588235294, |
|
"eval_loss": 0.41040292382240295, |
|
"eval_runtime": 6.4965, |
|
"eval_samples_per_second": 27.707, |
|
"eval_steps_per_second": 3.54, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 121.56862745098039, |
|
"grad_norm": 0.6831104755401611, |
|
"learning_rate": 7.745e-06, |
|
"loss": 0.4461, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 125.49019607843137, |
|
"grad_norm": 1.195868968963623, |
|
"learning_rate": 7.995e-06, |
|
"loss": 0.4429, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 129.41176470588235, |
|
"grad_norm": 1.1746853590011597, |
|
"learning_rate": 8.245000000000002e-06, |
|
"loss": 0.4358, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 133.33333333333334, |
|
"grad_norm": 1.2797439098358154, |
|
"learning_rate": 8.495e-06, |
|
"loss": 0.4383, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 137.2549019607843, |
|
"grad_norm": 0.6744837760925293, |
|
"learning_rate": 8.745000000000002e-06, |
|
"loss": 0.4416, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 141.1764705882353, |
|
"grad_norm": 0.7655614018440247, |
|
"learning_rate": 8.995000000000001e-06, |
|
"loss": 0.4338, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 145.09803921568627, |
|
"grad_norm": 0.9920282363891602, |
|
"learning_rate": 9.245e-06, |
|
"loss": 0.4337, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 149.01960784313727, |
|
"grad_norm": 0.9740642309188843, |
|
"learning_rate": 9.495000000000001e-06, |
|
"loss": 0.4309, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 152.94117647058823, |
|
"grad_norm": 0.9331285953521729, |
|
"learning_rate": 9.745e-06, |
|
"loss": 0.4337, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 156.86274509803923, |
|
"grad_norm": 0.8512988686561584, |
|
"learning_rate": 9.995000000000002e-06, |
|
"loss": 0.4289, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 156.86274509803923, |
|
"eval_loss": 0.4016592502593994, |
|
"eval_runtime": 6.4392, |
|
"eval_samples_per_second": 27.954, |
|
"eval_steps_per_second": 3.572, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 160.7843137254902, |
|
"grad_norm": 0.7746613025665283, |
|
"learning_rate": 9.755e-06, |
|
"loss": 0.4306, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 164.7058823529412, |
|
"grad_norm": 0.6868831515312195, |
|
"learning_rate": 9.505000000000001e-06, |
|
"loss": 0.4302, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 168.62745098039215, |
|
"grad_norm": 1.010834813117981, |
|
"learning_rate": 9.255e-06, |
|
"loss": 0.4254, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 172.54901960784315, |
|
"grad_norm": 1.054592490196228, |
|
"learning_rate": 9.005000000000001e-06, |
|
"loss": 0.4248, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 176.47058823529412, |
|
"grad_norm": 0.8121660351753235, |
|
"learning_rate": 8.755e-06, |
|
"loss": 0.4227, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 180.3921568627451, |
|
"grad_norm": 0.6637047529220581, |
|
"learning_rate": 8.505e-06, |
|
"loss": 0.4232, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 184.31372549019608, |
|
"grad_norm": 1.0822277069091797, |
|
"learning_rate": 8.255000000000001e-06, |
|
"loss": 0.4226, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 188.23529411764707, |
|
"grad_norm": 0.759693443775177, |
|
"learning_rate": 8.005e-06, |
|
"loss": 0.4236, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 192.15686274509804, |
|
"grad_norm": 0.576042652130127, |
|
"learning_rate": 7.755000000000001e-06, |
|
"loss": 0.4162, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 196.07843137254903, |
|
"grad_norm": 0.8360034227371216, |
|
"learning_rate": 7.505e-06, |
|
"loss": 0.4223, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 196.07843137254903, |
|
"eval_loss": 0.39692553877830505, |
|
"eval_runtime": 6.4387, |
|
"eval_samples_per_second": 27.956, |
|
"eval_steps_per_second": 3.572, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 200.0, |
|
"grad_norm": 0.7426376342773438, |
|
"learning_rate": 7.255000000000001e-06, |
|
"loss": 0.4157, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 203.92156862745097, |
|
"grad_norm": 1.1800576448440552, |
|
"learning_rate": 7.005000000000001e-06, |
|
"loss": 0.419, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 207.84313725490196, |
|
"grad_norm": 0.7355245351791382, |
|
"learning_rate": 6.7550000000000005e-06, |
|
"loss": 0.4174, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 211.76470588235293, |
|
"grad_norm": 0.5805600881576538, |
|
"learning_rate": 6.505e-06, |
|
"loss": 0.4146, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 215.68627450980392, |
|
"grad_norm": 0.9223101139068604, |
|
"learning_rate": 6.255e-06, |
|
"loss": 0.4178, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 219.6078431372549, |
|
"grad_norm": 0.8155106902122498, |
|
"learning_rate": 6.005000000000001e-06, |
|
"loss": 0.4151, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 223.52941176470588, |
|
"grad_norm": 0.6420881748199463, |
|
"learning_rate": 5.755000000000001e-06, |
|
"loss": 0.4156, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 227.45098039215685, |
|
"grad_norm": 0.7704824209213257, |
|
"learning_rate": 5.505000000000001e-06, |
|
"loss": 0.4157, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 231.37254901960785, |
|
"grad_norm": 0.6147534251213074, |
|
"learning_rate": 5.2550000000000005e-06, |
|
"loss": 0.4177, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 235.2941176470588, |
|
"grad_norm": 0.758510172367096, |
|
"learning_rate": 5.0049999999999995e-06, |
|
"loss": 0.4149, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 235.2941176470588, |
|
"eval_loss": 0.39595454931259155, |
|
"eval_runtime": 6.3959, |
|
"eval_samples_per_second": 28.143, |
|
"eval_steps_per_second": 3.596, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 239.2156862745098, |
|
"grad_norm": 0.7012345790863037, |
|
"learning_rate": 4.755e-06, |
|
"loss": 0.4103, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 243.13725490196077, |
|
"grad_norm": 0.7060217261314392, |
|
"learning_rate": 4.505e-06, |
|
"loss": 0.4114, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 247.05882352941177, |
|
"grad_norm": 0.8892201781272888, |
|
"learning_rate": 4.255e-06, |
|
"loss": 0.4093, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 250.98039215686273, |
|
"grad_norm": 0.6098002791404724, |
|
"learning_rate": 4.005000000000001e-06, |
|
"loss": 0.411, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 254.90196078431373, |
|
"grad_norm": 0.7227322459220886, |
|
"learning_rate": 3.7550000000000005e-06, |
|
"loss": 0.4108, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 258.8235294117647, |
|
"grad_norm": 0.8260012865066528, |
|
"learning_rate": 3.505e-06, |
|
"loss": 0.412, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 262.7450980392157, |
|
"grad_norm": 0.6897534132003784, |
|
"learning_rate": 3.255e-06, |
|
"loss": 0.4112, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 266.6666666666667, |
|
"grad_norm": 1.0072269439697266, |
|
"learning_rate": 3.005e-06, |
|
"loss": 0.413, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 270.5882352941176, |
|
"grad_norm": 0.6094086170196533, |
|
"learning_rate": 2.7550000000000003e-06, |
|
"loss": 0.4104, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 274.5098039215686, |
|
"grad_norm": 0.6129186749458313, |
|
"learning_rate": 2.505e-06, |
|
"loss": 0.4129, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 274.5098039215686, |
|
"eval_loss": 0.39622604846954346, |
|
"eval_runtime": 6.4203, |
|
"eval_samples_per_second": 28.036, |
|
"eval_steps_per_second": 3.582, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 278.4313725490196, |
|
"grad_norm": 0.6087095737457275, |
|
"learning_rate": 2.2550000000000004e-06, |
|
"loss": 0.4122, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 282.3529411764706, |
|
"grad_norm": 0.5597049593925476, |
|
"learning_rate": 2.0050000000000003e-06, |
|
"loss": 0.4111, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 286.27450980392155, |
|
"grad_norm": 0.5411703586578369, |
|
"learning_rate": 1.7550000000000001e-06, |
|
"loss": 0.4069, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 290.19607843137254, |
|
"grad_norm": 0.6042822599411011, |
|
"learning_rate": 1.505e-06, |
|
"loss": 0.4079, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 294.11764705882354, |
|
"grad_norm": 0.6852837204933167, |
|
"learning_rate": 1.255e-06, |
|
"loss": 0.4068, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 298.03921568627453, |
|
"grad_norm": 0.6229822039604187, |
|
"learning_rate": 1.0050000000000001e-06, |
|
"loss": 0.4112, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 301.96078431372547, |
|
"grad_norm": 0.6330438256263733, |
|
"learning_rate": 7.550000000000001e-07, |
|
"loss": 0.4053, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 305.88235294117646, |
|
"grad_norm": 0.5041052103042603, |
|
"learning_rate": 5.05e-07, |
|
"loss": 0.405, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 309.80392156862746, |
|
"grad_norm": 0.6818642020225525, |
|
"learning_rate": 2.55e-07, |
|
"loss": 0.4095, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 313.72549019607845, |
|
"grad_norm": 0.4621037542819977, |
|
"learning_rate": 5e-09, |
|
"loss": 0.4108, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 313.72549019607845, |
|
"eval_loss": 0.39425286650657654, |
|
"eval_runtime": 6.437, |
|
"eval_samples_per_second": 27.963, |
|
"eval_steps_per_second": 3.573, |
|
"step": 4000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 4000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 334, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.70355911863679e+16, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|