|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 372000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 9.375e-06, |
|
"loss": 6.8546, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.875e-05, |
|
"loss": 5.3571, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 2.8125e-05, |
|
"loss": 5.0355, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 3.75e-05, |
|
"loss": 4.8077, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 4.6874999999999994e-05, |
|
"loss": 4.6389, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 5.625e-05, |
|
"loss": 4.4944, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 6.5625e-05, |
|
"loss": 4.3811, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 7.5e-05, |
|
"loss": 4.2867, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 8.437499999999999e-05, |
|
"loss": 4.2018, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.374999999999999e-05, |
|
"loss": 4.126, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.000103115625, |
|
"loss": 4.0645, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.000112490625, |
|
"loss": 4.005, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.000121865625, |
|
"loss": 3.9411, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.00013123125, |
|
"loss": 3.889, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 0.000140596875, |
|
"loss": 3.8468, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 0.0001499625, |
|
"loss": 3.8099, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 0.000159328125, |
|
"loss": 3.7701, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.000168703125, |
|
"loss": 3.7444, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.34515730881026885, |
|
"eval_loss": 3.9303231239318848, |
|
"eval_runtime": 153.4286, |
|
"eval_samples_per_second": 377.492, |
|
"eval_steps_per_second": 5.899, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 0.000178078125, |
|
"loss": 3.7026, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 0.00018745312499999998, |
|
"loss": 3.676, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 0.00019681874999999998, |
|
"loss": 3.6595, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 0.00020619374999999998, |
|
"loss": 3.6375, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"learning_rate": 0.00021555937499999998, |
|
"loss": 3.6166, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 0.00022493437499999998, |
|
"loss": 3.6019, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 0.00023430937499999997, |
|
"loss": 3.5891, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 0.00024368437499999997, |
|
"loss": 3.57, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 0.00025305, |
|
"loss": 3.5536, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 0.000262425, |
|
"loss": 3.544, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 0.000271790625, |
|
"loss": 3.5305, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 0.000281165625, |
|
"loss": 3.5216, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 0.000290540625, |
|
"loss": 3.5093, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"learning_rate": 0.00029990624999999993, |
|
"loss": 3.5018, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"learning_rate": 0.00029912647058823525, |
|
"loss": 3.4851, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"learning_rate": 0.000298245, |
|
"loss": 3.4713, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 0.0002973626470588235, |
|
"loss": 3.459, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 0.000296480294117647, |
|
"loss": 3.4466, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 0.00029559794117647056, |
|
"loss": 3.4367, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.3746240995756378, |
|
"eval_loss": 3.637483596801758, |
|
"eval_runtime": 155.3554, |
|
"eval_samples_per_second": 372.81, |
|
"eval_steps_per_second": 5.825, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"learning_rate": 0.0002947155882352941, |
|
"loss": 3.3868, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"learning_rate": 0.0002938341176470588, |
|
"loss": 3.3768, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"learning_rate": 0.00029295176470588234, |
|
"loss": 3.3733, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"learning_rate": 0.0002920694117647058, |
|
"loss": 3.3685, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"learning_rate": 0.00029118794117647057, |
|
"loss": 3.3617, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 0.00029030558823529406, |
|
"loss": 3.3563, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"learning_rate": 0.0002894241176470588, |
|
"loss": 3.3446, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"learning_rate": 0.00028854176470588234, |
|
"loss": 3.3428, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"learning_rate": 0.00028766029411764703, |
|
"loss": 3.3408, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 0.0002867779411764706, |
|
"loss": 3.3363, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 0.0002858955882352941, |
|
"loss": 3.3302, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"learning_rate": 0.00028501323529411766, |
|
"loss": 3.3205, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"learning_rate": 0.0002841317647058823, |
|
"loss": 3.3202, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 0.00028324941176470584, |
|
"loss": 3.3153, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 0.0002823670588235294, |
|
"loss": 3.3082, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"learning_rate": 0.00028148558823529407, |
|
"loss": 3.3033, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"learning_rate": 0.0002806032352941176, |
|
"loss": 3.298, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"learning_rate": 0.00027972176470588236, |
|
"loss": 3.29, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.38803887906463225, |
|
"eval_loss": 3.5155463218688965, |
|
"eval_runtime": 155.2596, |
|
"eval_samples_per_second": 373.04, |
|
"eval_steps_per_second": 5.829, |
|
"step": 55800 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"learning_rate": 0.00027883941176470585, |
|
"loss": 3.2763, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"learning_rate": 0.0002779570588235294, |
|
"loss": 3.2269, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"learning_rate": 0.0002770755882352941, |
|
"loss": 3.2265, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 0.0002761932352941176, |
|
"loss": 3.2255, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"learning_rate": 0.0002753117647058823, |
|
"loss": 3.2326, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"learning_rate": 0.00027442941176470585, |
|
"loss": 3.2239, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"learning_rate": 0.0002735470588235294, |
|
"loss": 3.2301, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"learning_rate": 0.0002726655882352941, |
|
"loss": 3.2253, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 0.00027178323529411763, |
|
"loss": 3.2214, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"learning_rate": 0.00027090088235294117, |
|
"loss": 3.2241, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"learning_rate": 0.00027001941176470586, |
|
"loss": 3.2196, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 0.00026913705882352935, |
|
"loss": 3.2222, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"learning_rate": 0.0002682547058823529, |
|
"loss": 3.2175, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"learning_rate": 0.00026737323529411763, |
|
"loss": 3.2146, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"learning_rate": 0.0002664908823529411, |
|
"loss": 3.2127, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"learning_rate": 0.00026560941176470587, |
|
"loss": 3.2124, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 0.0002647270588235294, |
|
"loss": 3.2123, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"learning_rate": 0.0002638455882352941, |
|
"loss": 3.2077, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"learning_rate": 0.00026296323529411764, |
|
"loss": 3.2081, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.3940327953259559, |
|
"eval_loss": 3.475128412246704, |
|
"eval_runtime": 154.8524, |
|
"eval_samples_per_second": 374.021, |
|
"eval_steps_per_second": 5.844, |
|
"step": 74400 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"learning_rate": 0.0002620808823529412, |
|
"loss": 3.1625, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"learning_rate": 0.0002611994117647058, |
|
"loss": 3.1382, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"learning_rate": 0.00026031705882352936, |
|
"loss": 3.1446, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"learning_rate": 0.0002594347058823529, |
|
"loss": 3.1452, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"learning_rate": 0.00025855235294117645, |
|
"loss": 3.1449, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"learning_rate": 0.00025767088235294114, |
|
"loss": 3.1535, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"learning_rate": 0.0002567894117647059, |
|
"loss": 3.1498, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"learning_rate": 0.0002559070588235294, |
|
"loss": 3.1478, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"learning_rate": 0.0002550247058823529, |
|
"loss": 3.1472, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"learning_rate": 0.00025414235294117645, |
|
"loss": 3.1502, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"learning_rate": 0.00025325999999999994, |
|
"loss": 3.1471, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"learning_rate": 0.0002523785294117647, |
|
"loss": 3.1488, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"learning_rate": 0.00025149617647058823, |
|
"loss": 3.1472, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"learning_rate": 0.0002506138235294117, |
|
"loss": 3.1535, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"learning_rate": 0.00024973235294117646, |
|
"loss": 3.1546, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"learning_rate": 0.00024885, |
|
"loss": 3.1469, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"learning_rate": 0.0002479685294117647, |
|
"loss": 3.15, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"learning_rate": 0.00024708617647058824, |
|
"loss": 3.1455, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 0.0002462047058823529, |
|
"loss": 3.1438, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.398348033440005, |
|
"eval_loss": 3.419024705886841, |
|
"eval_runtime": 154.8451, |
|
"eval_samples_per_second": 374.038, |
|
"eval_steps_per_second": 5.845, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"learning_rate": 0.0002453223529411764, |
|
"loss": 3.0764, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"learning_rate": 0.00024443999999999996, |
|
"loss": 3.0808, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"learning_rate": 0.0002435576470588235, |
|
"loss": 3.0854, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"learning_rate": 0.0002426770588235294, |
|
"loss": 3.0928, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"learning_rate": 0.00024179470588235293, |
|
"loss": 3.0894, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"learning_rate": 0.00024091323529411765, |
|
"loss": 3.0909, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"learning_rate": 0.00024003088235294116, |
|
"loss": 3.0971, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"learning_rate": 0.00023914941176470585, |
|
"loss": 3.0979, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"learning_rate": 0.00023826705882352937, |
|
"loss": 3.1, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"learning_rate": 0.0002373847058823529, |
|
"loss": 3.0951, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"learning_rate": 0.00023650235294117645, |
|
"loss": 3.0994, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"learning_rate": 0.00023562088235294114, |
|
"loss": 3.0989, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 0.00023473852941176469, |
|
"loss": 3.1001, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"learning_rate": 0.00023385617647058823, |
|
"loss": 3.0974, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"learning_rate": 0.00023297382352941174, |
|
"loss": 3.0927, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"learning_rate": 0.00023209235294117646, |
|
"loss": 3.0977, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"learning_rate": 0.00023121, |
|
"loss": 3.102, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"learning_rate": 0.0002303276470588235, |
|
"loss": 3.0947, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.40218399473400135, |
|
"eval_loss": 3.3905391693115234, |
|
"eval_runtime": 155.1728, |
|
"eval_samples_per_second": 373.248, |
|
"eval_steps_per_second": 5.832, |
|
"step": 111600 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"learning_rate": 0.0002294461764705882, |
|
"loss": 3.0704, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"learning_rate": 0.00022856382352941175, |
|
"loss": 3.031, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"learning_rate": 0.00022768235294117644, |
|
"loss": 3.0337, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"learning_rate": 0.00022679999999999998, |
|
"loss": 3.0437, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"learning_rate": 0.00022591764705882353, |
|
"loss": 3.0462, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"learning_rate": 0.00022503617647058822, |
|
"loss": 3.0505, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.34, |
|
"learning_rate": 0.00022415382352941176, |
|
"loss": 3.0507, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 0.00022327235294117645, |
|
"loss": 3.0527, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"learning_rate": 0.00022238999999999996, |
|
"loss": 3.0526, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"learning_rate": 0.0002215076470588235, |
|
"loss": 3.0515, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"learning_rate": 0.00022062617647058822, |
|
"loss": 3.0496, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"learning_rate": 0.0002197447058823529, |
|
"loss": 3.0517, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"learning_rate": 0.00021886235294117645, |
|
"loss": 3.0625, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"learning_rate": 0.00021798, |
|
"loss": 3.0565, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.77, |
|
"learning_rate": 0.0002170976470588235, |
|
"loss": 3.0602, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"learning_rate": 0.00021621617647058823, |
|
"loss": 3.0597, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"learning_rate": 0.00021533470588235292, |
|
"loss": 3.0591, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"learning_rate": 0.00021445235294117643, |
|
"loss": 3.0612, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"learning_rate": 0.00021356999999999998, |
|
"loss": 3.0569, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.40292205685134075, |
|
"eval_loss": 3.3831627368927, |
|
"eval_runtime": 154.6364, |
|
"eval_samples_per_second": 374.543, |
|
"eval_steps_per_second": 5.852, |
|
"step": 130200 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"learning_rate": 0.00021268764705882352, |
|
"loss": 3.0044, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"learning_rate": 0.00021180529411764704, |
|
"loss": 2.9984, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"learning_rate": 0.00021092382352941175, |
|
"loss": 2.9983, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"learning_rate": 0.0002100414705882353, |
|
"loss": 3.0072, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"learning_rate": 0.0002091591176470588, |
|
"loss": 3.013, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"learning_rate": 0.00020827764705882353, |
|
"loss": 3.0085, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"learning_rate": 0.00020739529411764701, |
|
"loss": 3.009, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"learning_rate": 0.00020651382352941173, |
|
"loss": 3.0152, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.47, |
|
"learning_rate": 0.00020563147058823527, |
|
"loss": 3.0172, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"learning_rate": 0.0002047491176470588, |
|
"loss": 3.0198, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"learning_rate": 0.0002038676470588235, |
|
"loss": 3.0207, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.63, |
|
"learning_rate": 0.00020298529411764705, |
|
"loss": 3.0238, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"learning_rate": 0.0002021029411764706, |
|
"loss": 3.0222, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"learning_rate": 0.00020122147058823528, |
|
"loss": 3.0237, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"learning_rate": 0.00020033911764705882, |
|
"loss": 3.0225, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"learning_rate": 0.00019945852941176468, |
|
"loss": 3.0232, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"learning_rate": 0.0001985761764705882, |
|
"loss": 3.0275, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"learning_rate": 0.00019769382352941174, |
|
"loss": 3.029, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.40509459032581013, |
|
"eval_loss": 3.3739943504333496, |
|
"eval_runtime": 154.5594, |
|
"eval_samples_per_second": 374.73, |
|
"eval_steps_per_second": 5.855, |
|
"step": 148800 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"learning_rate": 0.0001968114705882353, |
|
"loss": 3.0145, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"learning_rate": 0.00019592999999999998, |
|
"loss": 2.9607, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"learning_rate": 0.00019504764705882352, |
|
"loss": 2.9717, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"learning_rate": 0.00019416529411764706, |
|
"loss": 2.9732, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"learning_rate": 0.00019328382352941175, |
|
"loss": 2.977, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"learning_rate": 0.00019240147058823527, |
|
"loss": 2.974, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"learning_rate": 0.00019151911764705878, |
|
"loss": 2.9822, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"learning_rate": 0.00019063676470588233, |
|
"loss": 2.9825, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"learning_rate": 0.00018975529411764704, |
|
"loss": 2.9837, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"learning_rate": 0.00018887294117647058, |
|
"loss": 2.9869, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"learning_rate": 0.00018799147058823527, |
|
"loss": 2.9899, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"learning_rate": 0.00018710911764705882, |
|
"loss": 2.9881, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"learning_rate": 0.00018622676470588236, |
|
"loss": 2.9899, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"learning_rate": 0.00018534441176470588, |
|
"loss": 2.9893, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"learning_rate": 0.00018446294117647056, |
|
"loss": 2.9917, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"learning_rate": 0.00018358147058823528, |
|
"loss": 2.9931, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"learning_rate": 0.0001826991176470588, |
|
"loss": 2.995, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"learning_rate": 0.00018181676470588234, |
|
"loss": 2.9965, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"learning_rate": 0.00018093441176470585, |
|
"loss": 2.9953, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.40588418108360097, |
|
"eval_loss": 3.3781139850616455, |
|
"eval_runtime": 155.1788, |
|
"eval_samples_per_second": 373.234, |
|
"eval_steps_per_second": 5.832, |
|
"step": 167400 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"learning_rate": 0.0001800520588235294, |
|
"loss": 2.9607, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"learning_rate": 0.00017916970588235294, |
|
"loss": 2.9343, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"learning_rate": 0.00017828823529411763, |
|
"loss": 2.94, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"learning_rate": 0.00017740588235294117, |
|
"loss": 2.9503, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"learning_rate": 0.00017652441176470586, |
|
"loss": 2.9435, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"learning_rate": 0.00017564205882352938, |
|
"loss": 2.954, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.35, |
|
"learning_rate": 0.0001747605882352941, |
|
"loss": 2.9553, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"learning_rate": 0.00017387823529411764, |
|
"loss": 2.9573, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"learning_rate": 0.00017299588235294115, |
|
"loss": 2.9569, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"learning_rate": 0.0001721135294117647, |
|
"loss": 2.9613, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"learning_rate": 0.0001712320588235294, |
|
"loss": 2.9613, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"learning_rate": 0.00017034970588235293, |
|
"loss": 2.9651, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"learning_rate": 0.00016946823529411762, |
|
"loss": 2.9647, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"learning_rate": 0.00016858588235294116, |
|
"loss": 2.9642, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.78, |
|
"learning_rate": 0.00016770352941176467, |
|
"loss": 2.9664, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"learning_rate": 0.0001668220588235294, |
|
"loss": 2.9679, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.89, |
|
"learning_rate": 0.00016593970588235293, |
|
"loss": 2.9668, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"learning_rate": 0.00016505735294117645, |
|
"loss": 2.9655, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"learning_rate": 0.00016417588235294117, |
|
"loss": 2.9667, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.40691334306935756, |
|
"eval_loss": 3.3879029750823975, |
|
"eval_runtime": 154.9035, |
|
"eval_samples_per_second": 373.897, |
|
"eval_steps_per_second": 5.842, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.05, |
|
"learning_rate": 0.0001632935294117647, |
|
"loss": 2.904, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"learning_rate": 0.00016241117647058822, |
|
"loss": 2.9111, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"learning_rate": 0.0001615297058823529, |
|
"loss": 2.915, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"learning_rate": 0.00016064735294117646, |
|
"loss": 2.9198, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.27, |
|
"learning_rate": 0.00015976588235294114, |
|
"loss": 2.9215, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"learning_rate": 0.0001588835294117647, |
|
"loss": 2.9301, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"learning_rate": 0.00015800117647058823, |
|
"loss": 2.9329, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"learning_rate": 0.00015711970588235292, |
|
"loss": 2.9325, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.48, |
|
"learning_rate": 0.00015623735294117646, |
|
"loss": 2.9347, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"learning_rate": 0.000155355, |
|
"loss": 2.9338, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.59, |
|
"learning_rate": 0.00015447264705882352, |
|
"loss": 2.9341, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"learning_rate": 0.0001535911764705882, |
|
"loss": 2.9439, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"learning_rate": 0.00015270882352941173, |
|
"loss": 2.9439, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"learning_rate": 0.00015182735294117644, |
|
"loss": 2.9419, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"learning_rate": 0.00015094499999999999, |
|
"loss": 2.9381, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"learning_rate": 0.00015006264705882353, |
|
"loss": 2.9431, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"learning_rate": 0.00014918029411764704, |
|
"loss": 2.9435, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"learning_rate": 0.00014829882352941176, |
|
"loss": 2.9426, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.4070695413601653, |
|
"eval_loss": 3.376622438430786, |
|
"eval_runtime": 154.8354, |
|
"eval_samples_per_second": 374.062, |
|
"eval_steps_per_second": 5.845, |
|
"step": 204600 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"learning_rate": 0.00014741735294117645, |
|
"loss": 2.918, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"learning_rate": 0.000146535, |
|
"loss": 2.8858, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"learning_rate": 0.0001456535294117647, |
|
"loss": 2.8906, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.18, |
|
"learning_rate": 0.00014477117647058822, |
|
"loss": 2.8937, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"learning_rate": 0.00014388882352941174, |
|
"loss": 2.9007, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.29, |
|
"learning_rate": 0.00014300647058823528, |
|
"loss": 2.8996, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.34, |
|
"learning_rate": 0.000142125, |
|
"loss": 2.9052, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"learning_rate": 0.00014124264705882351, |
|
"loss": 2.9059, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.45, |
|
"learning_rate": 0.0001403611764705882, |
|
"loss": 2.9084, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.51, |
|
"learning_rate": 0.00013947882352941175, |
|
"loss": 2.9092, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"learning_rate": 0.0001385964705882353, |
|
"loss": 2.915, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.61, |
|
"learning_rate": 0.000137715, |
|
"loss": 2.9136, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"learning_rate": 0.00013683264705882352, |
|
"loss": 2.9149, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"learning_rate": 0.00013595029411764704, |
|
"loss": 2.9185, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.77, |
|
"learning_rate": 0.00013506882352941175, |
|
"loss": 2.9194, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"learning_rate": 0.0001341864705882353, |
|
"loss": 2.9203, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.88, |
|
"learning_rate": 0.0001333041176470588, |
|
"loss": 2.9234, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.94, |
|
"learning_rate": 0.0001324226470588235, |
|
"loss": 2.9217, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 11.99, |
|
"learning_rate": 0.00013154029411764704, |
|
"loss": 2.9217, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.40853243072891327, |
|
"eval_loss": 3.3643746376037598, |
|
"eval_runtime": 154.7928, |
|
"eval_samples_per_second": 374.165, |
|
"eval_steps_per_second": 5.847, |
|
"step": 223200 |
|
}, |
|
{ |
|
"epoch": 12.04, |
|
"learning_rate": 0.0001306579411764706, |
|
"loss": 2.8752, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"learning_rate": 0.0001297755882352941, |
|
"loss": 2.8662, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.15, |
|
"learning_rate": 0.00012889323529411765, |
|
"loss": 2.8742, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.2, |
|
"learning_rate": 0.0001280126470588235, |
|
"loss": 2.8765, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.26, |
|
"learning_rate": 0.00012713029411764705, |
|
"loss": 2.8786, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.31, |
|
"learning_rate": 0.00012624882352941177, |
|
"loss": 2.8818, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"learning_rate": 0.00012536647058823528, |
|
"loss": 2.8831, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.42, |
|
"learning_rate": 0.0001244841176470588, |
|
"loss": 2.8812, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.47, |
|
"learning_rate": 0.00012360264705882351, |
|
"loss": 2.8865, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.53, |
|
"learning_rate": 0.00012272029411764706, |
|
"loss": 2.8874, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.58, |
|
"learning_rate": 0.00012183794117647059, |
|
"loss": 2.8898, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.63, |
|
"learning_rate": 0.00012095558823529412, |
|
"loss": 2.8919, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.69, |
|
"learning_rate": 0.0001200741176470588, |
|
"loss": 2.8975, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.74, |
|
"learning_rate": 0.00011919176470588235, |
|
"loss": 2.8972, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"learning_rate": 0.00011831029411764705, |
|
"loss": 2.8976, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.85, |
|
"learning_rate": 0.00011742794117647058, |
|
"loss": 2.9018, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.9, |
|
"learning_rate": 0.0001165455882352941, |
|
"loss": 2.8994, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"learning_rate": 0.00011566411764705881, |
|
"loss": 2.8993, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.40817650017205326, |
|
"eval_loss": 3.3694441318511963, |
|
"eval_runtime": 154.6531, |
|
"eval_samples_per_second": 374.503, |
|
"eval_steps_per_second": 5.852, |
|
"step": 241800 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"learning_rate": 0.00011478176470588234, |
|
"loss": 2.8887, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.06, |
|
"learning_rate": 0.00011389941176470588, |
|
"loss": 2.8461, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"learning_rate": 0.0001130170588235294, |
|
"loss": 2.8513, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.17, |
|
"learning_rate": 0.0001121355882352941, |
|
"loss": 2.856, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.23, |
|
"learning_rate": 0.00011125411764705882, |
|
"loss": 2.8574, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"learning_rate": 0.00011037176470588235, |
|
"loss": 2.859, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.33, |
|
"learning_rate": 0.00010948941176470588, |
|
"loss": 2.8599, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.39, |
|
"learning_rate": 0.00010860794117647058, |
|
"loss": 2.867, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.44, |
|
"learning_rate": 0.00010772558823529411, |
|
"loss": 2.8631, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.49, |
|
"learning_rate": 0.00010684323529411764, |
|
"loss": 2.8676, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.55, |
|
"learning_rate": 0.00010596088235294117, |
|
"loss": 2.8709, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"learning_rate": 0.00010507941176470587, |
|
"loss": 2.8697, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.66, |
|
"learning_rate": 0.0001041970588235294, |
|
"loss": 2.8746, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.71, |
|
"learning_rate": 0.00010331470588235293, |
|
"loss": 2.8754, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.76, |
|
"learning_rate": 0.00010243323529411765, |
|
"loss": 2.8773, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"learning_rate": 0.00010155088235294116, |
|
"loss": 2.8724, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.87, |
|
"learning_rate": 0.00010066852941176469, |
|
"loss": 2.8786, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.92, |
|
"learning_rate": 9.978617647058822e-05, |
|
"loss": 2.8808, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"learning_rate": 9.890382352941176e-05, |
|
"loss": 2.8758, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.4088403261124711, |
|
"eval_loss": 3.386643886566162, |
|
"eval_runtime": 154.4069, |
|
"eval_samples_per_second": 375.1, |
|
"eval_steps_per_second": 5.861, |
|
"step": 260400 |
|
}, |
|
{ |
|
"epoch": 14.03, |
|
"learning_rate": 9.802235294117647e-05, |
|
"loss": 2.8446, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.09, |
|
"learning_rate": 9.714088235294117e-05, |
|
"loss": 2.8291, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.14, |
|
"learning_rate": 9.62585294117647e-05, |
|
"loss": 2.8344, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.19, |
|
"learning_rate": 9.537705882352941e-05, |
|
"loss": 2.8359, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"learning_rate": 9.449470588235294e-05, |
|
"loss": 2.8377, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.3, |
|
"learning_rate": 9.361235294117646e-05, |
|
"loss": 2.8413, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.35, |
|
"learning_rate": 9.273088235294116e-05, |
|
"loss": 2.8432, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.41, |
|
"learning_rate": 9.18485294117647e-05, |
|
"loss": 2.8471, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.46, |
|
"learning_rate": 9.096617647058823e-05, |
|
"loss": 2.8443, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.52, |
|
"learning_rate": 9.008382352941176e-05, |
|
"loss": 2.8479, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.57, |
|
"learning_rate": 8.920235294117645e-05, |
|
"loss": 2.8513, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.62, |
|
"learning_rate": 8.832e-05, |
|
"loss": 2.8549, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.68, |
|
"learning_rate": 8.743941176470588e-05, |
|
"loss": 2.8546, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.73, |
|
"learning_rate": 8.65570588235294e-05, |
|
"loss": 2.8584, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.78, |
|
"learning_rate": 8.567470588235293e-05, |
|
"loss": 2.8543, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.84, |
|
"learning_rate": 8.479235294117646e-05, |
|
"loss": 2.8597, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.89, |
|
"learning_rate": 8.391088235294117e-05, |
|
"loss": 2.8591, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.95, |
|
"learning_rate": 8.302941176470588e-05, |
|
"loss": 2.8617, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"learning_rate": 8.21470588235294e-05, |
|
"loss": 2.8544, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.4086822467239676, |
|
"eval_loss": 3.384922504425049, |
|
"eval_runtime": 154.8421, |
|
"eval_samples_per_second": 374.046, |
|
"eval_steps_per_second": 5.845, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.05, |
|
"learning_rate": 8.12664705882353e-05, |
|
"loss": 2.8106, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.11, |
|
"learning_rate": 8.038411764705882e-05, |
|
"loss": 2.8126, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.16, |
|
"learning_rate": 7.950176470588235e-05, |
|
"loss": 2.8172, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.22, |
|
"learning_rate": 7.861941176470587e-05, |
|
"loss": 2.8257, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.27, |
|
"learning_rate": 7.77370588235294e-05, |
|
"loss": 2.8198, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.32, |
|
"learning_rate": 7.685558823529411e-05, |
|
"loss": 2.8253, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.38, |
|
"learning_rate": 7.597323529411764e-05, |
|
"loss": 2.8255, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.43, |
|
"learning_rate": 7.509176470588235e-05, |
|
"loss": 2.8325, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.48, |
|
"learning_rate": 7.420941176470588e-05, |
|
"loss": 2.8319, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.54, |
|
"learning_rate": 7.332794117647058e-05, |
|
"loss": 2.8316, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.59, |
|
"learning_rate": 7.244558823529411e-05, |
|
"loss": 2.8325, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.65, |
|
"learning_rate": 7.156323529411764e-05, |
|
"loss": 2.835, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.7, |
|
"learning_rate": 7.068088235294117e-05, |
|
"loss": 2.8365, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.75, |
|
"learning_rate": 6.97985294117647e-05, |
|
"loss": 2.8328, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.81, |
|
"learning_rate": 6.891705882352941e-05, |
|
"loss": 2.834, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.86, |
|
"learning_rate": 6.803470588235293e-05, |
|
"loss": 2.8412, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.91, |
|
"learning_rate": 6.715323529411764e-05, |
|
"loss": 2.8342, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.97, |
|
"learning_rate": 6.627088235294117e-05, |
|
"loss": 2.8363, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.4085711947778578, |
|
"eval_loss": 3.4028050899505615, |
|
"eval_runtime": 154.624, |
|
"eval_samples_per_second": 374.573, |
|
"eval_steps_per_second": 5.853, |
|
"step": 297600 |
|
}, |
|
{ |
|
"epoch": 16.02, |
|
"learning_rate": 6.538941176470588e-05, |
|
"loss": 2.8225, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.08, |
|
"learning_rate": 6.45070588235294e-05, |
|
"loss": 2.7936, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.13, |
|
"learning_rate": 6.362470588235293e-05, |
|
"loss": 2.8004, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.18, |
|
"learning_rate": 6.274323529411764e-05, |
|
"loss": 2.8005, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.24, |
|
"learning_rate": 6.186088235294117e-05, |
|
"loss": 2.8083, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.29, |
|
"learning_rate": 6.09785294117647e-05, |
|
"loss": 2.8083, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.34, |
|
"learning_rate": 6.0097058823529405e-05, |
|
"loss": 2.8047, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"learning_rate": 5.921470588235294e-05, |
|
"loss": 2.8067, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.45, |
|
"learning_rate": 5.8332352941176464e-05, |
|
"loss": 2.8139, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.51, |
|
"learning_rate": 5.7449999999999994e-05, |
|
"loss": 2.8145, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.56, |
|
"learning_rate": 5.6568529411764696e-05, |
|
"loss": 2.8164, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.61, |
|
"learning_rate": 5.568705882352941e-05, |
|
"loss": 2.8113, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.67, |
|
"learning_rate": 5.4804705882352935e-05, |
|
"loss": 2.8179, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.72, |
|
"learning_rate": 5.3923235294117644e-05, |
|
"loss": 2.819, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.77, |
|
"learning_rate": 5.3040882352941166e-05, |
|
"loss": 2.816, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.83, |
|
"learning_rate": 5.21585294117647e-05, |
|
"loss": 2.8201, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.88, |
|
"learning_rate": 5.127617647058823e-05, |
|
"loss": 2.8151, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.94, |
|
"learning_rate": 5.0394705882352935e-05, |
|
"loss": 2.819, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 16.99, |
|
"learning_rate": 4.951235294117647e-05, |
|
"loss": 2.8222, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.40890569425739837, |
|
"eval_loss": 3.3989803791046143, |
|
"eval_runtime": 154.6683, |
|
"eval_samples_per_second": 374.466, |
|
"eval_steps_per_second": 5.851, |
|
"step": 316200 |
|
}, |
|
{ |
|
"epoch": 17.04, |
|
"learning_rate": 4.8629999999999993e-05, |
|
"loss": 2.7864, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.1, |
|
"learning_rate": 4.774764705882353e-05, |
|
"loss": 2.7845, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.15, |
|
"learning_rate": 4.6867058823529405e-05, |
|
"loss": 2.7865, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"learning_rate": 4.598470588235294e-05, |
|
"loss": 2.7893, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.26, |
|
"learning_rate": 4.5103235294117643e-05, |
|
"loss": 2.788, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.31, |
|
"learning_rate": 4.422088235294117e-05, |
|
"loss": 2.7909, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.37, |
|
"learning_rate": 4.33385294117647e-05, |
|
"loss": 2.7889, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.42, |
|
"learning_rate": 4.245617647058823e-05, |
|
"loss": 2.7943, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.47, |
|
"learning_rate": 4.157382352941177e-05, |
|
"loss": 2.7979, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.53, |
|
"learning_rate": 4.069235294117647e-05, |
|
"loss": 2.7969, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.58, |
|
"learning_rate": 3.981e-05, |
|
"loss": 2.7966, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.63, |
|
"learning_rate": 3.892764705882352e-05, |
|
"loss": 2.802, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.69, |
|
"learning_rate": 3.804529411764706e-05, |
|
"loss": 2.7987, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.74, |
|
"learning_rate": 3.716382352941176e-05, |
|
"loss": 2.7979, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"learning_rate": 3.6282352941176464e-05, |
|
"loss": 2.7976, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.85, |
|
"learning_rate": 3.540088235294117e-05, |
|
"loss": 2.8026, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.9, |
|
"learning_rate": 3.45185294117647e-05, |
|
"loss": 2.798, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.96, |
|
"learning_rate": 3.363617647058823e-05, |
|
"loss": 2.8018, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.4095525903185545, |
|
"eval_loss": 3.398442029953003, |
|
"eval_runtime": 155.2345, |
|
"eval_samples_per_second": 373.1, |
|
"eval_steps_per_second": 5.83, |
|
"step": 334800 |
|
}, |
|
{ |
|
"epoch": 18.01, |
|
"learning_rate": 3.275382352941176e-05, |
|
"loss": 2.7957, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.06, |
|
"learning_rate": 3.187147058823529e-05, |
|
"loss": 2.7702, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.12, |
|
"learning_rate": 3.099e-05, |
|
"loss": 2.7682, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.17, |
|
"learning_rate": 3.0108529411764705e-05, |
|
"loss": 2.7714, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.23, |
|
"learning_rate": 2.9226176470588235e-05, |
|
"loss": 2.7758, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.28, |
|
"learning_rate": 2.834382352941176e-05, |
|
"loss": 2.7784, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.33, |
|
"learning_rate": 2.746147058823529e-05, |
|
"loss": 2.7762, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.39, |
|
"learning_rate": 2.6579117647058823e-05, |
|
"loss": 2.779, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.44, |
|
"learning_rate": 2.5697647058823526e-05, |
|
"loss": 2.7778, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.49, |
|
"learning_rate": 2.481529411764706e-05, |
|
"loss": 2.7852, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.55, |
|
"learning_rate": 2.3932941176470588e-05, |
|
"loss": 2.7799, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.6, |
|
"learning_rate": 2.3050588235294114e-05, |
|
"loss": 2.7818, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.66, |
|
"learning_rate": 2.2169117647058823e-05, |
|
"loss": 2.7816, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.71, |
|
"learning_rate": 2.128764705882353e-05, |
|
"loss": 2.7822, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.76, |
|
"learning_rate": 2.040529411764706e-05, |
|
"loss": 2.7823, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.82, |
|
"learning_rate": 1.9522941176470585e-05, |
|
"loss": 2.7791, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.87, |
|
"learning_rate": 1.8640588235294117e-05, |
|
"loss": 2.7863, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.92, |
|
"learning_rate": 1.7759117647058823e-05, |
|
"loss": 2.784, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.98, |
|
"learning_rate": 1.6876764705882353e-05, |
|
"loss": 2.7834, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.4090608848172977, |
|
"eval_loss": 3.4143290519714355, |
|
"eval_runtime": 154.3909, |
|
"eval_samples_per_second": 375.139, |
|
"eval_steps_per_second": 5.862, |
|
"step": 353400 |
|
}, |
|
{ |
|
"epoch": 19.03, |
|
"learning_rate": 1.599529411764706e-05, |
|
"loss": 2.7704, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.09, |
|
"learning_rate": 1.5113823529411762e-05, |
|
"loss": 2.7643, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.14, |
|
"learning_rate": 1.4231470588235294e-05, |
|
"loss": 2.7622, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.19, |
|
"learning_rate": 1.3349117647058821e-05, |
|
"loss": 2.7693, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"learning_rate": 1.2466764705882353e-05, |
|
"loss": 2.7638, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.3, |
|
"learning_rate": 1.158441176470588e-05, |
|
"loss": 2.7635, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.35, |
|
"learning_rate": 1.0702058823529411e-05, |
|
"loss": 2.7606, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.41, |
|
"learning_rate": 9.820588235294115e-06, |
|
"loss": 2.7663, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.46, |
|
"learning_rate": 8.938235294117647e-06, |
|
"loss": 2.7634, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.52, |
|
"learning_rate": 8.056764705882352e-06, |
|
"loss": 2.7638, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.57, |
|
"learning_rate": 7.175294117647058e-06, |
|
"loss": 2.7667, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.62, |
|
"learning_rate": 6.2929411764705876e-06, |
|
"loss": 2.7678, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.68, |
|
"learning_rate": 5.411470588235293e-06, |
|
"loss": 2.7655, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.73, |
|
"learning_rate": 4.529117647058823e-06, |
|
"loss": 2.7639, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.78, |
|
"learning_rate": 3.6467647058823527e-06, |
|
"loss": 2.7684, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.84, |
|
"learning_rate": 2.765294117647059e-06, |
|
"loss": 2.7625, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.89, |
|
"learning_rate": 1.882941176470588e-06, |
|
"loss": 2.7642, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.95, |
|
"learning_rate": 1.0005882352941176e-06, |
|
"loss": 2.7677, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"learning_rate": 1.1823529411764706e-07, |
|
"loss": 2.7626, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.4089445254884035, |
|
"eval_loss": 3.4202394485473633, |
|
"eval_runtime": 154.4184, |
|
"eval_samples_per_second": 375.072, |
|
"eval_steps_per_second": 5.861, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 372000, |
|
"total_flos": 1.56732865394688e+18, |
|
"train_loss": 3.0576216975386425, |
|
"train_runtime": 80892.0203, |
|
"train_samples_per_second": 147.156, |
|
"train_steps_per_second": 4.599 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 372000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.56732865394688e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|