|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9998557067962099, |
|
"eval_steps": 500, |
|
"global_step": 5197, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 8.974358974358975e-07, |
|
"loss": 2.9732, |
|
"loss_": 1.436, |
|
"moe_loss": 0.1675, |
|
"moe_loss_longrong": 1.4982, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 1.794871794871795e-06, |
|
"loss": 2.9414, |
|
"loss_": 1.3375, |
|
"moe_loss": 0.1674, |
|
"moe_loss_longrong": 1.4964, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 2.6923076923076923e-06, |
|
"loss": 2.8682, |
|
"loss_": 1.1438, |
|
"moe_loss": 0.1663, |
|
"moe_loss_longrong": 1.4917, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 3.58974358974359e-06, |
|
"loss": 2.806, |
|
"loss_": 1.2312, |
|
"moe_loss": 0.1655, |
|
"moe_loss_longrong": 1.488, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 4.487179487179488e-06, |
|
"loss": 2.7734, |
|
"loss_": 1.2285, |
|
"moe_loss": 0.1645, |
|
"moe_loss_longrong": 1.4882, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 5.384615384615385e-06, |
|
"loss": 2.7314, |
|
"loss_": 1.0526, |
|
"moe_loss": 0.1635, |
|
"moe_loss_longrong": 1.4847, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 6.282051282051282e-06, |
|
"loss": 2.6961, |
|
"loss_": 0.9249, |
|
"moe_loss": 0.1627, |
|
"moe_loss_longrong": 1.4803, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 7.17948717948718e-06, |
|
"loss": 2.6779, |
|
"loss_": 0.9535, |
|
"moe_loss": 0.1665, |
|
"moe_loss_longrong": 1.5082, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 8.076923076923077e-06, |
|
"loss": 2.6983, |
|
"loss_": 1.1541, |
|
"moe_loss": 0.1617, |
|
"moe_loss_longrong": 1.4777, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 8.974358974358976e-06, |
|
"loss": 2.6718, |
|
"loss_": 1.0194, |
|
"moe_loss": 0.1613, |
|
"moe_loss_longrong": 1.4718, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 9.871794871794872e-06, |
|
"loss": 2.6443, |
|
"loss_": 0.7302, |
|
"moe_loss": 0.1615, |
|
"moe_loss_longrong": 1.4723, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 1.076923076923077e-05, |
|
"loss": 2.7002, |
|
"loss_": 1.1497, |
|
"moe_loss": 0.1615, |
|
"moe_loss_longrong": 1.4679, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 2.6528, |
|
"loss_": 1.114, |
|
"moe_loss": 0.1613, |
|
"moe_loss_longrong": 1.4653, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 1.2564102564102565e-05, |
|
"loss": 2.667, |
|
"loss_": 1.0337, |
|
"moe_loss": 0.1614, |
|
"moe_loss_longrong": 1.466, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 1.3461538461538463e-05, |
|
"loss": 2.61, |
|
"loss_": 0.8435, |
|
"moe_loss": 0.1643, |
|
"moe_loss_longrong": 1.4985, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 1.435897435897436e-05, |
|
"loss": 2.6878, |
|
"loss_": 1.2116, |
|
"moe_loss": 0.1611, |
|
"moe_loss_longrong": 1.4619, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 1.5256410256410257e-05, |
|
"loss": 2.6614, |
|
"loss_": 1.2295, |
|
"moe_loss": 0.1612, |
|
"moe_loss_longrong": 1.4628, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 1.6153846153846154e-05, |
|
"loss": 2.6541, |
|
"loss_": 1.087, |
|
"moe_loss": 0.1612, |
|
"moe_loss_longrong": 1.4593, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.7051282051282053e-05, |
|
"loss": 2.6268, |
|
"loss_": 1.0603, |
|
"moe_loss": 0.1611, |
|
"moe_loss_longrong": 1.4563, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.794871794871795e-05, |
|
"loss": 2.6591, |
|
"loss_": 1.1362, |
|
"moe_loss": 0.1611, |
|
"moe_loss_longrong": 1.4566, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.8846153846153846e-05, |
|
"loss": 2.6675, |
|
"loss_": 0.9348, |
|
"moe_loss": 0.1611, |
|
"moe_loss_longrong": 1.4551, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.9743589743589745e-05, |
|
"loss": 2.6317, |
|
"loss_": 1.1054, |
|
"moe_loss": 0.1611, |
|
"moe_loss_longrong": 1.4542, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.999995145147809e-05, |
|
"loss": 2.658, |
|
"loss_": 0.9526, |
|
"moe_loss": 0.1611, |
|
"moe_loss_longrong": 1.4514, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.9999720361590812e-05, |
|
"loss": 2.6381, |
|
"loss_": 0.9977, |
|
"moe_loss": 0.1615, |
|
"moe_loss_longrong": 1.4506, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.9999298966967264e-05, |
|
"loss": 2.6193, |
|
"loss_": 0.8738, |
|
"moe_loss": 0.1612, |
|
"moe_loss_longrong": 1.4527, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.9998687275627008e-05, |
|
"loss": 2.617, |
|
"loss_": 1.0383, |
|
"moe_loss": 0.1612, |
|
"moe_loss_longrong": 1.4503, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.999788529921114e-05, |
|
"loss": 2.6334, |
|
"loss_": 0.9878, |
|
"moe_loss": 0.1611, |
|
"moe_loss_longrong": 1.4484, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.9996893052982083e-05, |
|
"loss": 2.6288, |
|
"loss_": 0.9578, |
|
"moe_loss": 0.161, |
|
"moe_loss_longrong": 1.4476, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.9995710555823277e-05, |
|
"loss": 2.6573, |
|
"loss_": 1.0551, |
|
"moe_loss": 0.1608, |
|
"moe_loss_longrong": 1.4478, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.9994337830238836e-05, |
|
"loss": 2.6195, |
|
"loss_": 1.2421, |
|
"moe_loss": 0.1609, |
|
"moe_loss_longrong": 1.4449, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.9992774902353104e-05, |
|
"loss": 2.5979, |
|
"loss_": 1.0235, |
|
"moe_loss": 0.1627, |
|
"moe_loss_longrong": 1.4802, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.9991021801910177e-05, |
|
"loss": 2.6143, |
|
"loss_": 1.1486, |
|
"moe_loss": 0.1608, |
|
"moe_loss_longrong": 1.4443, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.9989078562273313e-05, |
|
"loss": 2.6047, |
|
"loss_": 0.9541, |
|
"moe_loss": 0.161, |
|
"moe_loss_longrong": 1.4455, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.9986945220424326e-05, |
|
"loss": 2.6336, |
|
"loss_": 1.1406, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4428, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.9984621816962843e-05, |
|
"loss": 2.6217, |
|
"loss_": 1.0207, |
|
"moe_loss": 0.1608, |
|
"moe_loss_longrong": 1.4447, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.9982108396105584e-05, |
|
"loss": 2.6014, |
|
"loss_": 1.3744, |
|
"moe_loss": 0.1608, |
|
"moe_loss_longrong": 1.4426, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.9979405005685466e-05, |
|
"loss": 2.6134, |
|
"loss_": 0.9548, |
|
"moe_loss": 0.1609, |
|
"moe_loss_longrong": 1.4415, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.997651169715073e-05, |
|
"loss": 2.6022, |
|
"loss_": 1.058, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4413, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.9973428525563948e-05, |
|
"loss": 2.6219, |
|
"loss_": 1.1897, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4404, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.9970155549600978e-05, |
|
"loss": 2.6232, |
|
"loss_": 1.25, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4401, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.996669283154984e-05, |
|
"loss": 2.5805, |
|
"loss_": 1.138, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4387, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.996304043730955e-05, |
|
"loss": 2.6188, |
|
"loss_": 1.2563, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4386, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.995919843638883e-05, |
|
"loss": 2.5867, |
|
"loss_": 0.9975, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4387, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.9955166901904838e-05, |
|
"loss": 2.5987, |
|
"loss_": 1.157, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4366, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.9950945910581718e-05, |
|
"loss": 2.5875, |
|
"loss_": 1.0582, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4378, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.9946535542749187e-05, |
|
"loss": 2.5848, |
|
"loss_": 0.9934, |
|
"moe_loss": 0.1624, |
|
"moe_loss_longrong": 1.4703, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.9941935882340976e-05, |
|
"loss": 2.6086, |
|
"loss_": 0.8756, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4366, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.9937147016893257e-05, |
|
"loss": 2.5968, |
|
"loss_": 1.1941, |
|
"moe_loss": 0.1608, |
|
"moe_loss_longrong": 1.4376, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.9932169037542947e-05, |
|
"loss": 2.6158, |
|
"loss_": 0.9761, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4358, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.9927002039026002e-05, |
|
"loss": 2.5944, |
|
"loss_": 1.2162, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4346, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.9921646119675606e-05, |
|
"loss": 2.5806, |
|
"loss_": 0.8511, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4358, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.9916101381420285e-05, |
|
"loss": 2.6285, |
|
"loss_": 0.9065, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4361, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.991036792978199e-05, |
|
"loss": 2.6076, |
|
"loss_": 0.7095, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4347, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.9904445873874068e-05, |
|
"loss": 2.5824, |
|
"loss_": 0.571, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4343, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1.98983353263992e-05, |
|
"loss": 2.5803, |
|
"loss_": 0.9037, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4344, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.9892036403647256e-05, |
|
"loss": 2.6071, |
|
"loss_": 1.0289, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4333, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.9885549225493064e-05, |
|
"loss": 2.6155, |
|
"loss_": 1.227, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4324, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.9878873915394154e-05, |
|
"loss": 2.6057, |
|
"loss_": 1.276, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4326, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.987201060038839e-05, |
|
"loss": 2.5446, |
|
"loss_": 1.1148, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.432, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.986495941109156e-05, |
|
"loss": 2.5787, |
|
"loss_": 0.9601, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4317, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.9857720481694887e-05, |
|
"loss": 2.6018, |
|
"loss_": 0.8145, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4318, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.985029394996248e-05, |
|
"loss": 2.5863, |
|
"loss_": 0.9872, |
|
"moe_loss": 0.1618, |
|
"moe_loss_longrong": 1.4613, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1.9842679957228706e-05, |
|
"loss": 2.5837, |
|
"loss_": 1.165, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.43, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.9834878648395507e-05, |
|
"loss": 2.6015, |
|
"loss_": 0.9562, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4307, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.9826890171929634e-05, |
|
"loss": 2.5453, |
|
"loss_": 0.8231, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4306, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.981871467985983e-05, |
|
"loss": 2.578, |
|
"loss_": 0.9864, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4306, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.9810352327773935e-05, |
|
"loss": 2.5723, |
|
"loss_": 1.1748, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4305, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.9801803274815915e-05, |
|
"loss": 2.6173, |
|
"loss_": 1.0737, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4297, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.979306768368285e-05, |
|
"loss": 2.5664, |
|
"loss_": 1.3735, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4291, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.9784145720621827e-05, |
|
"loss": 2.5832, |
|
"loss_": 1.0223, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4296, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.9775037555426772e-05, |
|
"loss": 2.5448, |
|
"loss_": 1.2395, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4291, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.9765743361435234e-05, |
|
"loss": 2.5729, |
|
"loss_": 1.1156, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4273, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.975626331552507e-05, |
|
"loss": 2.5526, |
|
"loss_": 0.8797, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4281, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.974659759811109e-05, |
|
"loss": 2.573, |
|
"loss_": 1.1636, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4274, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.9736746393141617e-05, |
|
"loss": 2.59, |
|
"loss_": 1.1342, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4279, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.9726709888094994e-05, |
|
"loss": 2.5921, |
|
"loss_": 0.8051, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4277, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 1.9716488273976006e-05, |
|
"loss": 2.6023, |
|
"loss_": 1.2093, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4276, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.970608174531224e-05, |
|
"loss": 2.5744, |
|
"loss_": 0.9951, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4256, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.9695490500150418e-05, |
|
"loss": 2.5917, |
|
"loss_": 1.0794, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.427, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.9684714740052584e-05, |
|
"loss": 2.5849, |
|
"loss_": 0.8469, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.426, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.9673754670092283e-05, |
|
"loss": 2.5705, |
|
"loss_": 0.96, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4265, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.9662610498850684e-05, |
|
"loss": 2.5672, |
|
"loss_": 1.038, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4258, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.965128243841256e-05, |
|
"loss": 2.553, |
|
"loss_": 1.1173, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4267, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.9639770704362305e-05, |
|
"loss": 2.5951, |
|
"loss_": 1.1815, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4255, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.9628075515779796e-05, |
|
"loss": 2.5528, |
|
"loss_": 0.916, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4247, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.961619709523623e-05, |
|
"loss": 2.5537, |
|
"loss_": 1.1069, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4252, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.9604135668789897e-05, |
|
"loss": 2.553, |
|
"loss_": 0.8815, |
|
"moe_loss": 0.1616, |
|
"moe_loss_longrong": 1.4545, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.959189146598188e-05, |
|
"loss": 2.557, |
|
"loss_": 0.3617, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4253, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.9579464719831668e-05, |
|
"loss": 2.5735, |
|
"loss_": 1.1934, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4248, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.9566855666832743e-05, |
|
"loss": 2.5679, |
|
"loss_": 1.2144, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4241, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.9554064546948064e-05, |
|
"loss": 2.5541, |
|
"loss_": 0.7773, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4247, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1.9541091603605508e-05, |
|
"loss": 2.5396, |
|
"loss_": 1.0911, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4238, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.9527937083693233e-05, |
|
"loss": 2.5328, |
|
"loss_": 1.2836, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4241, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.951460123755499e-05, |
|
"loss": 2.559, |
|
"loss_": 0.6191, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4249, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.9501084318985335e-05, |
|
"loss": 2.5656, |
|
"loss_": 0.5936, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4238, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.948738658522483e-05, |
|
"loss": 2.5408, |
|
"loss_": 1.0426, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4256, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.9473508296955126e-05, |
|
"loss": 2.5346, |
|
"loss_": 1.0259, |
|
"moe_loss": 0.1613, |
|
"moe_loss_longrong": 1.4496, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.9459449718294008e-05, |
|
"loss": 2.5744, |
|
"loss_": 1.2413, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4233, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.9445211116790365e-05, |
|
"loss": 2.5513, |
|
"loss_": 1.1087, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4224, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1.9430792763419105e-05, |
|
"loss": 2.5552, |
|
"loss_": 1.1375, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4219, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.9416194932576e-05, |
|
"loss": 2.5634, |
|
"loss_": 0.8712, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4224, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.9401417902072447e-05, |
|
"loss": 2.5538, |
|
"loss_": 0.9992, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4211, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.93864619531302e-05, |
|
"loss": 2.5786, |
|
"loss_": 1.0579, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4223, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.9371327370376018e-05, |
|
"loss": 2.5565, |
|
"loss_": 1.1717, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4223, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.935601444183622e-05, |
|
"loss": 2.5491, |
|
"loss_": 1.0508, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4216, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.934052345893125e-05, |
|
"loss": 2.5485, |
|
"loss_": 1.1535, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4201, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 1.932485471647009e-05, |
|
"loss": 2.5434, |
|
"loss_": 0.8935, |
|
"moe_loss": 0.1613, |
|
"moe_loss_longrong": 1.4462, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.9309008512644668e-05, |
|
"loss": 2.5549, |
|
"loss_": 0.9146, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4212, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.929298514902418e-05, |
|
"loss": 2.5655, |
|
"loss_": 1.2834, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4206, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.927678493054935e-05, |
|
"loss": 2.5664, |
|
"loss_": 1.1543, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4203, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.9260408165526638e-05, |
|
"loss": 2.5559, |
|
"loss_": 1.1544, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4205, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.9243855165622345e-05, |
|
"loss": 2.538, |
|
"loss_": 0.9985, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.42, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.9227126245856716e-05, |
|
"loss": 2.528, |
|
"loss_": 0.766, |
|
"moe_loss": 0.1609, |
|
"moe_loss_longrong": 1.4442, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.921022172459791e-05, |
|
"loss": 2.56, |
|
"loss_": 1.0356, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4201, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1.9193141923555984e-05, |
|
"loss": 2.5418, |
|
"loss_": 1.0224, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4191, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.917588716777672e-05, |
|
"loss": 2.5489, |
|
"loss_": 1.2415, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.42, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.9158457785635478e-05, |
|
"loss": 2.5647, |
|
"loss_": 1.0902, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4194, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.914085410883093e-05, |
|
"loss": 2.5695, |
|
"loss_": 1.0454, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4196, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.9123076472378753e-05, |
|
"loss": 2.5355, |
|
"loss_": 1.1475, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4185, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.910512521460525e-05, |
|
"loss": 2.5557, |
|
"loss_": 0.7606, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4187, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.908700067714091e-05, |
|
"loss": 2.5223, |
|
"loss_": 0.9963, |
|
"moe_loss": 0.1611, |
|
"moe_loss_longrong": 1.4451, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.906870320491391e-05, |
|
"loss": 2.5422, |
|
"loss_": 0.6658, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4189, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.9050233146143554e-05, |
|
"loss": 2.5373, |
|
"loss_": 1.022, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4191, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.9031590852333637e-05, |
|
"loss": 2.5536, |
|
"loss_": 0.9191, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4191, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.9012776678265756e-05, |
|
"loss": 2.5076, |
|
"loss_": 0.4788, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4179, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.899379098199257e-05, |
|
"loss": 2.5061, |
|
"loss_": 1.1005, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4178, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.897463412483098e-05, |
|
"loss": 2.5584, |
|
"loss_": 1.0856, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4184, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.895530647135524e-05, |
|
"loss": 2.5329, |
|
"loss_": 0.7922, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4175, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1.8935808389390032e-05, |
|
"loss": 2.524, |
|
"loss_": 1.0799, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4163, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.8916140250003475e-05, |
|
"loss": 2.5423, |
|
"loss_": 0.8152, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4174, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.8896302427500042e-05, |
|
"loss": 2.533, |
|
"loss_": 1.0454, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4181, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.8876295299413445e-05, |
|
"loss": 2.522, |
|
"loss_": 1.1185, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4169, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.885611924649946e-05, |
|
"loss": 2.5539, |
|
"loss_": 1.1256, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4167, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.883577465272866e-05, |
|
"loss": 2.5069, |
|
"loss_": 0.743, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4177, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.8815261905279133e-05, |
|
"loss": 2.5429, |
|
"loss_": 1.0463, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4162, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.879458139452909e-05, |
|
"loss": 2.5381, |
|
"loss_": 1.0908, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4152, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.877373351404946e-05, |
|
"loss": 2.4924, |
|
"loss_": 1.1275, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4167, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.8752718660596367e-05, |
|
"loss": 2.536, |
|
"loss_": 0.7467, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4165, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.873153723410362e-05, |
|
"loss": 2.507, |
|
"loss_": 1.0083, |
|
"moe_loss": 0.1609, |
|
"moe_loss_longrong": 1.4404, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.8710189637675055e-05, |
|
"loss": 2.5118, |
|
"loss_": 0.874, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4161, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.8688676277576916e-05, |
|
"loss": 2.5415, |
|
"loss_": 1.1152, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4157, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.866699756323008e-05, |
|
"loss": 2.5225, |
|
"loss_": 0.8857, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.416, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.8645153907202285e-05, |
|
"loss": 2.5093, |
|
"loss_": 1.0791, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4157, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.862314572520028e-05, |
|
"loss": 2.534, |
|
"loss_": 1.1649, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4167, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.86009734360619e-05, |
|
"loss": 2.559, |
|
"loss_": 1.2289, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4163, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.8578637461748105e-05, |
|
"loss": 2.5738, |
|
"loss_": 0.8422, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.416, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.8556138227334957e-05, |
|
"loss": 2.5752, |
|
"loss_": 1.0332, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4157, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.853347616100552e-05, |
|
"loss": 2.5633, |
|
"loss_": 1.2742, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4152, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.8510651694041702e-05, |
|
"loss": 2.5491, |
|
"loss_": 1.1394, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4153, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.848766526081607e-05, |
|
"loss": 2.5032, |
|
"loss_": 1.0904, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4148, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.846451729878357e-05, |
|
"loss": 2.5687, |
|
"loss_": 1.0973, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4144, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.84412082484732e-05, |
|
"loss": 2.5378, |
|
"loss_": 0.7628, |
|
"moe_loss": 0.161, |
|
"moe_loss_longrong": 1.4372, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.841773855347963e-05, |
|
"loss": 2.5285, |
|
"loss_": 1.0832, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4148, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.8394108660454766e-05, |
|
"loss": 2.53, |
|
"loss_": 0.7952, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.415, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.8370319019099236e-05, |
|
"loss": 2.5457, |
|
"loss_": 0.8855, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4146, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.8346370082153843e-05, |
|
"loss": 2.5227, |
|
"loss_": 1.1518, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4148, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.8322262305390948e-05, |
|
"loss": 2.5268, |
|
"loss_": 1.0055, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4139, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.8297996147605787e-05, |
|
"loss": 2.5418, |
|
"loss_": 1.2226, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4139, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1.8273572070607756e-05, |
|
"loss": 2.5465, |
|
"loss_": 1.0475, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.414, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.8248990539211596e-05, |
|
"loss": 2.5132, |
|
"loss_": 1.2063, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4148, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.822425202122858e-05, |
|
"loss": 2.5236, |
|
"loss_": 1.3605, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4145, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.819935698745759e-05, |
|
"loss": 2.517, |
|
"loss_": 0.9965, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4137, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.817430591167615e-05, |
|
"loss": 2.5347, |
|
"loss_": 1.1165, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4133, |
|
"step": 1141 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.8149099270631434e-05, |
|
"loss": 2.5051, |
|
"loss_": 0.9918, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4133, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.8123737544031178e-05, |
|
"loss": 2.5228, |
|
"loss_": 1.0757, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4132, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.8098221214534543e-05, |
|
"loss": 2.5117, |
|
"loss_": 0.9441, |
|
"moe_loss": 0.1608, |
|
"moe_loss_longrong": 1.435, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 1.807255076774294e-05, |
|
"loss": 2.5292, |
|
"loss_": 0.9315, |
|
"moe_loss": 0.1608, |
|
"moe_loss_longrong": 1.4332, |
|
"step": 1169 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.80467266921908e-05, |
|
"loss": 2.4974, |
|
"loss_": 0.9341, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.414, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.802074947933625e-05, |
|
"loss": 2.5251, |
|
"loss_": 1.0369, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4127, |
|
"step": 1183 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.799461962355178e-05, |
|
"loss": 2.5424, |
|
"loss_": 1.2525, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4133, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.7968337622114824e-05, |
|
"loss": 2.5123, |
|
"loss_": 1.4116, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4131, |
|
"step": 1197 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.7941903975198305e-05, |
|
"loss": 2.5119, |
|
"loss_": 0.8823, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4132, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.791531918586112e-05, |
|
"loss": 2.5372, |
|
"loss_": 0.9075, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4125, |
|
"step": 1211 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1.7888583760038534e-05, |
|
"loss": 2.5356, |
|
"loss_": 1.065, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4133, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.78616982065326e-05, |
|
"loss": 2.5154, |
|
"loss_": 0.9056, |
|
"moe_loss": 0.1611, |
|
"moe_loss_longrong": 1.4338, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.7834663037002444e-05, |
|
"loss": 2.5377, |
|
"loss_": 0.8469, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4123, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.7807478765954532e-05, |
|
"loss": 2.5363, |
|
"loss_": 1.0507, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4116, |
|
"step": 1239 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.778014591073288e-05, |
|
"loss": 2.5131, |
|
"loss_": 1.0501, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4123, |
|
"step": 1246 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.7752664991509224e-05, |
|
"loss": 2.5127, |
|
"loss_": 1.0583, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.412, |
|
"step": 1253 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.7725036531273087e-05, |
|
"loss": 2.5147, |
|
"loss_": 1.0225, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4119, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.7697261055821864e-05, |
|
"loss": 2.5399, |
|
"loss_": 1.1622, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4123, |
|
"step": 1267 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.7669339093750786e-05, |
|
"loss": 2.5042, |
|
"loss_": 1.1, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4118, |
|
"step": 1274 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.7641271176442876e-05, |
|
"loss": 2.4867, |
|
"loss_": 1.1917, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4116, |
|
"step": 1281 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.761305783805883e-05, |
|
"loss": 2.5159, |
|
"loss_": 1.1055, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4114, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.7584699615526857e-05, |
|
"loss": 2.5064, |
|
"loss_": 0.7231, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4119, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.755619704853246e-05, |
|
"loss": 2.5335, |
|
"loss_": 0.9998, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4112, |
|
"step": 1302 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.752755067950814e-05, |
|
"loss": 2.5332, |
|
"loss_": 1.2823, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4114, |
|
"step": 1309 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.749876105362313e-05, |
|
"loss": 2.5212, |
|
"loss_": 1.2273, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4109, |
|
"step": 1316 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1.746982871877296e-05, |
|
"loss": 2.5149, |
|
"loss_": 1.2552, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4113, |
|
"step": 1323 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.744075422556906e-05, |
|
"loss": 2.4876, |
|
"loss_": 0.9236, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4115, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.741153812732828e-05, |
|
"loss": 2.5421, |
|
"loss_": 1.155, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4106, |
|
"step": 1337 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.7382180980062365e-05, |
|
"loss": 2.5095, |
|
"loss_": 0.6978, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4109, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.735268334246734e-05, |
|
"loss": 2.5068, |
|
"loss_": 1.0142, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4105, |
|
"step": 1351 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.7323045775912927e-05, |
|
"loss": 2.5247, |
|
"loss_": 0.9039, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4111, |
|
"step": 1358 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.7293268844431826e-05, |
|
"loss": 2.5308, |
|
"loss_": 1.1161, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4101, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 1.7263353114708993e-05, |
|
"loss": 2.516, |
|
"loss_": 0.5494, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4105, |
|
"step": 1372 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.7233299156070852e-05, |
|
"loss": 2.5405, |
|
"loss_": 1.0823, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4091, |
|
"step": 1379 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.720310754047446e-05, |
|
"loss": 2.5123, |
|
"loss_": 1.016, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.41, |
|
"step": 1386 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.717277884249664e-05, |
|
"loss": 2.4917, |
|
"loss_": 1.1087, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4103, |
|
"step": 1393 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.7142313639323012e-05, |
|
"loss": 2.5184, |
|
"loss_": 1.2711, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4101, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.7111712510737035e-05, |
|
"loss": 2.5037, |
|
"loss_": 1.1198, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4102, |
|
"step": 1407 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.7080976039108964e-05, |
|
"loss": 2.5262, |
|
"loss_": 0.9286, |
|
"moe_loss": 0.1612, |
|
"moe_loss_longrong": 1.4315, |
|
"step": 1414 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.7050104809384774e-05, |
|
"loss": 2.4892, |
|
"loss_": 1.0377, |
|
"moe_loss": 0.1609, |
|
"moe_loss_longrong": 1.4276, |
|
"step": 1421 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.7019099409075014e-05, |
|
"loss": 2.5076, |
|
"loss_": 0.9405, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4092, |
|
"step": 1428 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.6987960428243637e-05, |
|
"loss": 2.5198, |
|
"loss_": 1.3093, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4094, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.6956688459496767e-05, |
|
"loss": 2.5508, |
|
"loss_": 1.0043, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4083, |
|
"step": 1442 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.6925284097971427e-05, |
|
"loss": 2.5299, |
|
"loss_": 1.1324, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4091, |
|
"step": 1449 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.6893747941324197e-05, |
|
"loss": 2.5495, |
|
"loss_": 0.7979, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4097, |
|
"step": 1456 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.6862080589719863e-05, |
|
"loss": 2.4692, |
|
"loss_": 0.563, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4097, |
|
"step": 1463 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.6830282645819974e-05, |
|
"loss": 2.5107, |
|
"loss_": 0.3532, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4107, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.679835471477139e-05, |
|
"loss": 2.498, |
|
"loss_": 0.9877, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4086, |
|
"step": 1477 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.6766297404194745e-05, |
|
"loss": 2.502, |
|
"loss_": 1.1402, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4091, |
|
"step": 1484 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.673411132417291e-05, |
|
"loss": 2.5066, |
|
"loss_": 1.167, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.409, |
|
"step": 1491 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.6701797087239354e-05, |
|
"loss": 2.5273, |
|
"loss_": 0.914, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4264, |
|
"step": 1498 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.666935530836651e-05, |
|
"loss": 2.5022, |
|
"loss_": 0.9724, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4087, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.663678660495406e-05, |
|
"loss": 2.4806, |
|
"loss_": 1.1766, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4086, |
|
"step": 1512 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.6604091596817193e-05, |
|
"loss": 2.5228, |
|
"loss_": 0.8955, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4086, |
|
"step": 1519 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.657127090617479e-05, |
|
"loss": 2.5303, |
|
"loss_": 0.9496, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4079, |
|
"step": 1526 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.6538325157637614e-05, |
|
"loss": 2.5162, |
|
"loss_": 0.8978, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4085, |
|
"step": 1533 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.650525497819639e-05, |
|
"loss": 2.5187, |
|
"loss_": 0.787, |
|
"moe_loss": 0.1608, |
|
"moe_loss_longrong": 1.4256, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6472060997209898e-05, |
|
"loss": 2.5283, |
|
"loss_": 1.1671, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4085, |
|
"step": 1547 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6438743846392987e-05, |
|
"loss": 2.5049, |
|
"loss_": 0.7488, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4082, |
|
"step": 1554 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6405304159804534e-05, |
|
"loss": 2.4966, |
|
"loss_": 1.0698, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4078, |
|
"step": 1561 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6371742573835426e-05, |
|
"loss": 2.5307, |
|
"loss_": 1.1426, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4075, |
|
"step": 1568 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6338059727196386e-05, |
|
"loss": 2.4884, |
|
"loss_": 1.0242, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4078, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 1.6304256260905872e-05, |
|
"loss": 2.5239, |
|
"loss_": 1.1671, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4079, |
|
"step": 1582 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.627033281827785e-05, |
|
"loss": 2.5292, |
|
"loss_": 0.9625, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4262, |
|
"step": 1589 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.6236290044909543e-05, |
|
"loss": 2.5336, |
|
"loss_": 0.8255, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4081, |
|
"step": 1596 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.6202128588669177e-05, |
|
"loss": 2.5205, |
|
"loss_": 0.7348, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4082, |
|
"step": 1603 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.6167849099683623e-05, |
|
"loss": 2.4854, |
|
"loss_": 0.9767, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4075, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.6133452230326035e-05, |
|
"loss": 2.5265, |
|
"loss_": 0.9913, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4074, |
|
"step": 1617 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.609893863520343e-05, |
|
"loss": 2.4785, |
|
"loss_": 0.9806, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4076, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1.6064308971144236e-05, |
|
"loss": 2.5053, |
|
"loss_": 1.207, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4073, |
|
"step": 1631 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.60295638971858e-05, |
|
"loss": 2.5212, |
|
"loss_": 1.2091, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4072, |
|
"step": 1638 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.599470407456182e-05, |
|
"loss": 2.5177, |
|
"loss_": 1.0634, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4071, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.5959730166689783e-05, |
|
"loss": 2.5219, |
|
"loss_": 0.7302, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4069, |
|
"step": 1652 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.5924642839158334e-05, |
|
"loss": 2.5273, |
|
"loss_": 1.1267, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4065, |
|
"step": 1659 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.5889442759714603e-05, |
|
"loss": 2.5067, |
|
"loss_": 0.8492, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4234, |
|
"step": 1666 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.5854130598251514e-05, |
|
"loss": 2.4997, |
|
"loss_": 1.0397, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4065, |
|
"step": 1673 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.581870702679501e-05, |
|
"loss": 2.5277, |
|
"loss_": 0.9804, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4073, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1.5783172719491288e-05, |
|
"loss": 2.5191, |
|
"loss_": 1.1463, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4064, |
|
"step": 1687 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.5747528352593956e-05, |
|
"loss": 2.4859, |
|
"loss_": 1.0594, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4063, |
|
"step": 1694 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.5711774604451168e-05, |
|
"loss": 2.5146, |
|
"loss_": 1.0352, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4063, |
|
"step": 1701 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.567591215549271e-05, |
|
"loss": 2.5086, |
|
"loss_": 0.8248, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4067, |
|
"step": 1708 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.5639941688217063e-05, |
|
"loss": 2.4807, |
|
"loss_": 0.8445, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4063, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.5603863887178393e-05, |
|
"loss": 2.5192, |
|
"loss_": 0.9476, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4057, |
|
"step": 1722 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.5567679438973543e-05, |
|
"loss": 2.5131, |
|
"loss_": 1.1376, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4062, |
|
"step": 1729 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1.5531389032228955e-05, |
|
"loss": 2.4964, |
|
"loss_": 1.2426, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4064, |
|
"step": 1736 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.549499335758757e-05, |
|
"loss": 2.5134, |
|
"loss_": 0.8763, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.406, |
|
"step": 1743 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.5458493107695688e-05, |
|
"loss": 2.4855, |
|
"loss_": 1.1827, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4061, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.542188897718977e-05, |
|
"loss": 2.4889, |
|
"loss_": 1.014, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4064, |
|
"step": 1757 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.5385181662683244e-05, |
|
"loss": 2.5111, |
|
"loss_": 1.0933, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4058, |
|
"step": 1764 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.534837186275322e-05, |
|
"loss": 2.5385, |
|
"loss_": 0.8571, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4062, |
|
"step": 1771 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.531146027792722e-05, |
|
"loss": 2.5107, |
|
"loss_": 0.9431, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4058, |
|
"step": 1778 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.527444761066982e-05, |
|
"loss": 2.5031, |
|
"loss_": 1.2333, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4056, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 1.523733456536931e-05, |
|
"loss": 2.4927, |
|
"loss_": 1.1367, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.406, |
|
"step": 1792 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.5200121848324276e-05, |
|
"loss": 2.5148, |
|
"loss_": 1.0457, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4052, |
|
"step": 1799 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.5162810167730144e-05, |
|
"loss": 2.4974, |
|
"loss_": 0.8327, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4055, |
|
"step": 1806 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.5125400233665728e-05, |
|
"loss": 2.4938, |
|
"loss_": 0.965, |
|
"moe_loss": 0.1603, |
|
"moe_loss_longrong": 1.4061, |
|
"step": 1813 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.50878927580797e-05, |
|
"loss": 2.4854, |
|
"loss_": 1.0636, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4061, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.5050288454777047e-05, |
|
"loss": 2.4829, |
|
"loss_": 0.8501, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4209, |
|
"step": 1827 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.501258803940548e-05, |
|
"loss": 2.5151, |
|
"loss_": 1.2857, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4049, |
|
"step": 1834 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1.4974792229441826e-05, |
|
"loss": 2.5045, |
|
"loss_": 1.0645, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4054, |
|
"step": 1841 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.4936901744178367e-05, |
|
"loss": 2.5062, |
|
"loss_": 0.5678, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4056, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.489891730470914e-05, |
|
"loss": 2.4826, |
|
"loss_": 1.2262, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4049, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.4860839633916236e-05, |
|
"loss": 2.466, |
|
"loss_": 0.7849, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4048, |
|
"step": 1862 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.4822669456456031e-05, |
|
"loss": 2.4872, |
|
"loss_": 0.8576, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4052, |
|
"step": 1869 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.4784407498745394e-05, |
|
"loss": 2.4951, |
|
"loss_": 0.8778, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4052, |
|
"step": 1876 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.4746054488947863e-05, |
|
"loss": 2.4876, |
|
"loss_": 0.8237, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.405, |
|
"step": 1883 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.470761115695979e-05, |
|
"loss": 2.4986, |
|
"loss_": 0.9971, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4045, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.4669078234396454e-05, |
|
"loss": 2.4678, |
|
"loss_": 1.1283, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4043, |
|
"step": 1897 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.4630456454578122e-05, |
|
"loss": 2.516, |
|
"loss_": 0.9592, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4045, |
|
"step": 1904 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.4591746552516109e-05, |
|
"loss": 2.5208, |
|
"loss_": 1.0451, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4182, |
|
"step": 1911 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.4552949264898795e-05, |
|
"loss": 2.498, |
|
"loss_": 0.8697, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4045, |
|
"step": 1918 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.4514065330077575e-05, |
|
"loss": 2.5174, |
|
"loss_": 0.8274, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4044, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.4475095488052843e-05, |
|
"loss": 2.5038, |
|
"loss_": 0.7792, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4052, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.4436040480459891e-05, |
|
"loss": 2.5116, |
|
"loss_": 0.9444, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4044, |
|
"step": 1939 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.4396901050554794e-05, |
|
"loss": 2.4786, |
|
"loss_": 1.0648, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4047, |
|
"step": 1946 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.435767794320027e-05, |
|
"loss": 2.4987, |
|
"loss_": 1.0158, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4046, |
|
"step": 1953 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.4318371904851502e-05, |
|
"loss": 2.5188, |
|
"loss_": 1.1058, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4045, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.4278983683541934e-05, |
|
"loss": 2.491, |
|
"loss_": 1.1232, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4187, |
|
"step": 1967 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.4239514028869032e-05, |
|
"loss": 2.487, |
|
"loss_": 0.9791, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4039, |
|
"step": 1974 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.4199963691980027e-05, |
|
"loss": 2.492, |
|
"loss_": 1.0493, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4046, |
|
"step": 1981 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.4160333425557616e-05, |
|
"loss": 2.5256, |
|
"loss_": 0.6311, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4039, |
|
"step": 1988 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 1.4120623983805617e-05, |
|
"loss": 2.502, |
|
"loss_": 0.8536, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4037, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.408083612243465e-05, |
|
"loss": 2.4939, |
|
"loss_": 1.0558, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4185, |
|
"step": 2002 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.4040970598647742e-05, |
|
"loss": 2.4975, |
|
"loss_": 0.9278, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4179, |
|
"step": 2009 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.40010281711259e-05, |
|
"loss": 2.4624, |
|
"loss_": 0.8695, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4036, |
|
"step": 2016 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.3961009600013702e-05, |
|
"loss": 2.4981, |
|
"loss_": 0.9502, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4041, |
|
"step": 2023 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.39209156469048e-05, |
|
"loss": 2.4973, |
|
"loss_": 1.0486, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4037, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.3880747074827454e-05, |
|
"loss": 2.498, |
|
"loss_": 1.0935, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4031, |
|
"step": 2037 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.384050464822999e-05, |
|
"loss": 2.4956, |
|
"loss_": 0.978, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4045, |
|
"step": 2044 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1.3800189132966257e-05, |
|
"loss": 2.4826, |
|
"loss_": 0.9682, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4039, |
|
"step": 2051 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.3759801296281072e-05, |
|
"loss": 2.499, |
|
"loss_": 0.8618, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4181, |
|
"step": 2058 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.371934190679558e-05, |
|
"loss": 2.4876, |
|
"loss_": 0.7575, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4032, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.3678811734492659e-05, |
|
"loss": 2.4821, |
|
"loss_": 0.8992, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4031, |
|
"step": 2072 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.3638211550702256e-05, |
|
"loss": 2.4975, |
|
"loss_": 0.9085, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4031, |
|
"step": 2079 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.3597542128086702e-05, |
|
"loss": 2.4958, |
|
"loss_": 1.1546, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.403, |
|
"step": 2086 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.3556804240626019e-05, |
|
"loss": 2.5323, |
|
"loss_": 1.0748, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4033, |
|
"step": 2093 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1.3515998663603174e-05, |
|
"loss": 2.5085, |
|
"loss_": 1.1199, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4034, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3475126173589343e-05, |
|
"loss": 2.4864, |
|
"loss_": 0.8556, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4029, |
|
"step": 2107 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3434187548429126e-05, |
|
"loss": 2.5068, |
|
"loss_": 0.946, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4031, |
|
"step": 2114 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3393183567225724e-05, |
|
"loss": 2.4837, |
|
"loss_": 1.1161, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4029, |
|
"step": 2121 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3352115010326155e-05, |
|
"loss": 2.4825, |
|
"loss_": 0.6543, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4027, |
|
"step": 2128 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3310982659306352e-05, |
|
"loss": 2.5257, |
|
"loss_": 1.2189, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4026, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3269787296956333e-05, |
|
"loss": 2.4993, |
|
"loss_": 0.9341, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4028, |
|
"step": 2142 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3228529707265279e-05, |
|
"loss": 2.4981, |
|
"loss_": 1.102, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4029, |
|
"step": 2149 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1.3187210675406617e-05, |
|
"loss": 2.5076, |
|
"loss_": 0.6091, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4026, |
|
"step": 2156 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.3145830987723081e-05, |
|
"loss": 2.4946, |
|
"loss_": 0.972, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4025, |
|
"step": 2163 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.3104391431711748e-05, |
|
"loss": 2.471, |
|
"loss_": 0.8826, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4026, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.306289279600905e-05, |
|
"loss": 2.4847, |
|
"loss_": 1.1855, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4025, |
|
"step": 2177 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.3021335870375763e-05, |
|
"loss": 2.505, |
|
"loss_": 1.0819, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4025, |
|
"step": 2184 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.297972144568198e-05, |
|
"loss": 2.4909, |
|
"loss_": 0.8074, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4024, |
|
"step": 2191 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.2938050313892062e-05, |
|
"loss": 2.4929, |
|
"loss_": 1.0944, |
|
"moe_loss": 0.1607, |
|
"moe_loss_longrong": 1.4177, |
|
"step": 2198 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 1.289632326804956e-05, |
|
"loss": 2.4747, |
|
"loss_": 0.8172, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4027, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.2854541102262119e-05, |
|
"loss": 2.4782, |
|
"loss_": 0.8552, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4028, |
|
"step": 2212 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.2812704611686386e-05, |
|
"loss": 2.4825, |
|
"loss_": 0.9202, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.402, |
|
"step": 2219 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.2770814592512853e-05, |
|
"loss": 2.4951, |
|
"loss_": 1.1396, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4022, |
|
"step": 2226 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.2728871841950719e-05, |
|
"loss": 2.4628, |
|
"loss_": 0.9138, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4025, |
|
"step": 2233 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.2686877158212715e-05, |
|
"loss": 2.5028, |
|
"loss_": 0.8915, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.402, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.2644831340499906e-05, |
|
"loss": 2.4802, |
|
"loss_": 1.3262, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4022, |
|
"step": 2247 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.2602735188986498e-05, |
|
"loss": 2.4888, |
|
"loss_": 1.1958, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4025, |
|
"step": 2254 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1.2560589504804592e-05, |
|
"loss": 2.4964, |
|
"loss_": 1.0784, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4019, |
|
"step": 2261 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.2518395090028952e-05, |
|
"loss": 2.4972, |
|
"loss_": 1.164, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.402, |
|
"step": 2268 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.2476152747661727e-05, |
|
"loss": 2.5173, |
|
"loss_": 1.083, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4018, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.243386328161718e-05, |
|
"loss": 2.5094, |
|
"loss_": 1.1749, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4021, |
|
"step": 2282 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.2391527496706389e-05, |
|
"loss": 2.5007, |
|
"loss_": 1.2048, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4019, |
|
"step": 2289 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.2349146198621917e-05, |
|
"loss": 2.4613, |
|
"loss_": 0.9356, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4018, |
|
"step": 2296 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.23067201939225e-05, |
|
"loss": 2.522, |
|
"loss_": 1.3161, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4019, |
|
"step": 2303 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1.2264250290017675e-05, |
|
"loss": 2.4876, |
|
"loss_": 0.9183, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4018, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.222173729515243e-05, |
|
"loss": 2.4852, |
|
"loss_": 1.0262, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4013, |
|
"step": 2317 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.217918201839182e-05, |
|
"loss": 2.4974, |
|
"loss_": 0.9078, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4019, |
|
"step": 2324 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.2136585269605558e-05, |
|
"loss": 2.4873, |
|
"loss_": 1.063, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4015, |
|
"step": 2331 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.209394785945263e-05, |
|
"loss": 2.4491, |
|
"loss_": 0.7031, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4026, |
|
"step": 2338 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.2051270599365825e-05, |
|
"loss": 2.5059, |
|
"loss_": 1.0756, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4012, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.2008554301536328e-05, |
|
"loss": 2.4821, |
|
"loss_": 0.508, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4018, |
|
"step": 2352 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.1965799778898258e-05, |
|
"loss": 2.4776, |
|
"loss_": 1.0053, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4016, |
|
"step": 2359 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.1923007845113178e-05, |
|
"loss": 2.512, |
|
"loss_": 0.6722, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4016, |
|
"step": 2366 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.1880179314554629e-05, |
|
"loss": 2.4488, |
|
"loss_": 0.4041, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.402, |
|
"step": 2373 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.1837315002292629e-05, |
|
"loss": 2.4889, |
|
"loss_": 1.1084, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.401, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.1794415724078147e-05, |
|
"loss": 2.4732, |
|
"loss_": 0.6909, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4019, |
|
"step": 2387 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.17514822963276e-05, |
|
"loss": 2.4599, |
|
"loss_": 1.0441, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4013, |
|
"step": 2394 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.1708515536107299e-05, |
|
"loss": 2.472, |
|
"loss_": 0.7234, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4012, |
|
"step": 2401 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.1665516261117914e-05, |
|
"loss": 2.4923, |
|
"loss_": 1.2036, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4014, |
|
"step": 2408 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 1.1622485289678886e-05, |
|
"loss": 2.4794, |
|
"loss_": 0.9414, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4011, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.1579423440712887e-05, |
|
"loss": 2.4873, |
|
"loss_": 0.8799, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4007, |
|
"step": 2422 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.153633153373022e-05, |
|
"loss": 2.4685, |
|
"loss_": 0.8351, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.414, |
|
"step": 2429 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.149321038881321e-05, |
|
"loss": 2.4965, |
|
"loss_": 1.0812, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4007, |
|
"step": 2436 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.1450060826600618e-05, |
|
"loss": 2.467, |
|
"loss_": 1.0899, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4007, |
|
"step": 2443 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.1406883668272015e-05, |
|
"loss": 2.5148, |
|
"loss_": 0.9878, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4008, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.1363679735532151e-05, |
|
"loss": 2.4869, |
|
"loss_": 1.0094, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4004, |
|
"step": 2457 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.132044985059532e-05, |
|
"loss": 2.4687, |
|
"loss_": 0.8133, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4004, |
|
"step": 2464 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1277194836169714e-05, |
|
"loss": 2.4692, |
|
"loss_": 1.154, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.401, |
|
"step": 2471 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1233915515441765e-05, |
|
"loss": 2.4768, |
|
"loss_": 0.9273, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4005, |
|
"step": 2478 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1190612712060475e-05, |
|
"loss": 2.465, |
|
"loss_": 0.9635, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4005, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1147287250121745e-05, |
|
"loss": 2.5032, |
|
"loss_": 1.3144, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4006, |
|
"step": 2492 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.11039399541527e-05, |
|
"loss": 2.4839, |
|
"loss_": 0.852, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4001, |
|
"step": 2499 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1060571649095972e-05, |
|
"loss": 2.4618, |
|
"loss_": 0.816, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4135, |
|
"step": 2506 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.1017183160294033e-05, |
|
"loss": 2.5082, |
|
"loss_": 0.7247, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4005, |
|
"step": 2513 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1.0973775313473465e-05, |
|
"loss": 2.5026, |
|
"loss_": 0.9287, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4007, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.0930348934729249e-05, |
|
"loss": 2.4564, |
|
"loss_": 1.0246, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4009, |
|
"step": 2527 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.0886904850509052e-05, |
|
"loss": 2.5123, |
|
"loss_": 1.1915, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4005, |
|
"step": 2534 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.0843443887597495e-05, |
|
"loss": 2.4786, |
|
"loss_": 0.9271, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4004, |
|
"step": 2541 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.0799966873100419e-05, |
|
"loss": 2.4941, |
|
"loss_": 1.2428, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4007, |
|
"step": 2548 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.0756474634429133e-05, |
|
"loss": 2.4861, |
|
"loss_": 1.1406, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4004, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.0712967999284682e-05, |
|
"loss": 2.474, |
|
"loss_": 1.0874, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4006, |
|
"step": 2562 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1.0669447795642103e-05, |
|
"loss": 2.478, |
|
"loss_": 1.2379, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4004, |
|
"step": 2569 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.0625914851734632e-05, |
|
"loss": 2.4567, |
|
"loss_": 1.0187, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4004, |
|
"step": 2576 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.0582369996037985e-05, |
|
"loss": 2.4762, |
|
"loss_": 0.9279, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4007, |
|
"step": 2583 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.053881405725456e-05, |
|
"loss": 2.4869, |
|
"loss_": 0.8599, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.0495247864297684e-05, |
|
"loss": 2.5043, |
|
"loss_": 1.0374, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4, |
|
"step": 2597 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.0451672246275826e-05, |
|
"loss": 2.4675, |
|
"loss_": 0.57, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4005, |
|
"step": 2604 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.0408088032476822e-05, |
|
"loss": 2.4752, |
|
"loss_": 0.5417, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4003, |
|
"step": 2611 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 1.036449605235211e-05, |
|
"loss": 2.4568, |
|
"loss_": 1.056, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3999, |
|
"step": 2618 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0320897135500904e-05, |
|
"loss": 2.4658, |
|
"loss_": 0.8557, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.4, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0277292111654447e-05, |
|
"loss": 2.4553, |
|
"loss_": 0.9975, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3999, |
|
"step": 2632 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0233681810660207e-05, |
|
"loss": 2.4815, |
|
"loss_": 1.0191, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4004, |
|
"step": 2639 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.019006706246607e-05, |
|
"loss": 2.4735, |
|
"loss_": 1.0673, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4001, |
|
"step": 2646 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0146448697104561e-05, |
|
"loss": 2.458, |
|
"loss_": 1.2381, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.4, |
|
"step": 2653 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.010282754467705e-05, |
|
"loss": 2.463, |
|
"loss_": 1.1428, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3997, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0059204435337938e-05, |
|
"loss": 2.483, |
|
"loss_": 1.2124, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3999, |
|
"step": 2667 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1.0015580199278873e-05, |
|
"loss": 2.4907, |
|
"loss_": 0.8323, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3995, |
|
"step": 2674 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 9.971955666712945e-06, |
|
"loss": 2.4936, |
|
"loss_": 1.1091, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3998, |
|
"step": 2681 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 9.928331667858886e-06, |
|
"loss": 2.5039, |
|
"loss_": 1.0505, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3998, |
|
"step": 2688 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 9.884709032925274e-06, |
|
"loss": 2.4704, |
|
"loss_": 0.9685, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.3998, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 9.841088592094726e-06, |
|
"loss": 2.4897, |
|
"loss_": 1.2011, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3993, |
|
"step": 2702 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 9.797471175508101e-06, |
|
"loss": 2.4642, |
|
"loss_": 1.064, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3997, |
|
"step": 2709 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 9.753857613248714e-06, |
|
"loss": 2.4746, |
|
"loss_": 1.089, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3995, |
|
"step": 2716 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 9.710248735326519e-06, |
|
"loss": 2.4767, |
|
"loss_": 0.7312, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3996, |
|
"step": 2723 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.666645371662324e-06, |
|
"loss": 2.4693, |
|
"loss_": 1.0271, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3994, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.623048352071998e-06, |
|
"loss": 2.4631, |
|
"loss_": 0.7867, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3992, |
|
"step": 2737 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.579458506250668e-06, |
|
"loss": 2.4744, |
|
"loss_": 1.1123, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.3996, |
|
"step": 2744 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.535876663756955e-06, |
|
"loss": 2.4836, |
|
"loss_": 0.9437, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3994, |
|
"step": 2751 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.492303653997146e-06, |
|
"loss": 2.4822, |
|
"loss_": 0.9857, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.399, |
|
"step": 2758 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.448740306209447e-06, |
|
"loss": 2.4816, |
|
"loss_": 0.9489, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3991, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.40518744944818e-06, |
|
"loss": 2.4744, |
|
"loss_": 0.6401, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3996, |
|
"step": 2772 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 9.361645912568015e-06, |
|
"loss": 2.4736, |
|
"loss_": 0.8008, |
|
"moe_loss": 0.1602, |
|
"moe_loss_longrong": 1.3996, |
|
"step": 2779 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.318116524208198e-06, |
|
"loss": 2.4719, |
|
"loss_": 0.9666, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3997, |
|
"step": 2786 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.27460011277677e-06, |
|
"loss": 2.4865, |
|
"loss_": 1.0383, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3994, |
|
"step": 2793 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.231097506434808e-06, |
|
"loss": 2.4683, |
|
"loss_": 0.807, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3995, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.187609533080668e-06, |
|
"loss": 2.4738, |
|
"loss_": 1.0131, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3992, |
|
"step": 2807 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.144137020334214e-06, |
|
"loss": 2.4559, |
|
"loss_": 0.9178, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3994, |
|
"step": 2814 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.100680795521104e-06, |
|
"loss": 2.4832, |
|
"loss_": 0.8958, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3993, |
|
"step": 2821 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.057241685656995e-06, |
|
"loss": 2.4729, |
|
"loss_": 0.8244, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.399, |
|
"step": 2828 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 9.013820517431841e-06, |
|
"loss": 2.4458, |
|
"loss_": 0.6857, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3989, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 8.970418117194146e-06, |
|
"loss": 2.4789, |
|
"loss_": 0.8677, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3991, |
|
"step": 2842 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 8.927035310935241e-06, |
|
"loss": 2.4633, |
|
"loss_": 0.755, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.399, |
|
"step": 2849 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 8.883672924273566e-06, |
|
"loss": 2.481, |
|
"loss_": 0.9947, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.399, |
|
"step": 2856 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 8.840331782438954e-06, |
|
"loss": 2.4821, |
|
"loss_": 0.6639, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3988, |
|
"step": 2863 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 8.797012710256923e-06, |
|
"loss": 2.4683, |
|
"loss_": 1.2205, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3992, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 8.753716532132992e-06, |
|
"loss": 2.4611, |
|
"loss_": 0.9415, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3989, |
|
"step": 2877 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 8.71044407203697e-06, |
|
"loss": 2.491, |
|
"loss_": 0.9864, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.399, |
|
"step": 2884 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 8.667196153487308e-06, |
|
"loss": 2.4726, |
|
"loss_": 0.865, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3986, |
|
"step": 2891 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 8.623973599535385e-06, |
|
"loss": 2.457, |
|
"loss_": 0.8526, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.399, |
|
"step": 2898 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 8.580777232749883e-06, |
|
"loss": 2.4576, |
|
"loss_": 1.0116, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3988, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 8.537607875201106e-06, |
|
"loss": 2.5113, |
|
"loss_": 0.8642, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3987, |
|
"step": 2912 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 8.494466348445345e-06, |
|
"loss": 2.4787, |
|
"loss_": 1.0994, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3988, |
|
"step": 2919 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 8.451353473509254e-06, |
|
"loss": 2.4797, |
|
"loss_": 1.1009, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3983, |
|
"step": 2926 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 8.408270070874201e-06, |
|
"loss": 2.4709, |
|
"loss_": 0.8487, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3984, |
|
"step": 2933 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.365216960460675e-06, |
|
"loss": 2.5019, |
|
"loss_": 0.9758, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3987, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.322194961612668e-06, |
|
"loss": 2.4919, |
|
"loss_": 0.9281, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3986, |
|
"step": 2947 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.279204893082083e-06, |
|
"loss": 2.4788, |
|
"loss_": 0.9675, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3986, |
|
"step": 2954 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.23624757301318e-06, |
|
"loss": 2.4796, |
|
"loss_": 0.9676, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3986, |
|
"step": 2961 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.193323818926955e-06, |
|
"loss": 2.4471, |
|
"loss_": 0.3893, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3986, |
|
"step": 2968 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.150434447705623e-06, |
|
"loss": 2.4644, |
|
"loss_": 1.0186, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3985, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 8.107580275577059e-06, |
|
"loss": 2.4517, |
|
"loss_": 1.2343, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3985, |
|
"step": 2982 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 8.064762118099258e-06, |
|
"loss": 2.4524, |
|
"loss_": 0.7846, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3985, |
|
"step": 2989 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 8.021980790144828e-06, |
|
"loss": 2.4626, |
|
"loss_": 1.0468, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3988, |
|
"step": 2996 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 7.979237105885467e-06, |
|
"loss": 2.4822, |
|
"loss_": 0.5538, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3985, |
|
"step": 3003 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 7.936531878776484e-06, |
|
"loss": 2.4753, |
|
"loss_": 1.1616, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3983, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 7.893865921541294e-06, |
|
"loss": 2.4418, |
|
"loss_": 0.5863, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3987, |
|
"step": 3017 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 7.85124004615598e-06, |
|
"loss": 2.4724, |
|
"loss_": 0.9406, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3982, |
|
"step": 3024 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 7.808655063833832e-06, |
|
"loss": 2.4884, |
|
"loss_": 1.0104, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3989, |
|
"step": 3031 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 7.766111785009888e-06, |
|
"loss": 2.4676, |
|
"loss_": 0.9396, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.398, |
|
"step": 3038 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.723611019325538e-06, |
|
"loss": 2.4705, |
|
"loss_": 0.9611, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3982, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.681153575613098e-06, |
|
"loss": 2.4555, |
|
"loss_": 0.931, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3986, |
|
"step": 3052 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.638740261880423e-06, |
|
"loss": 2.4369, |
|
"loss_": 0.7901, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3985, |
|
"step": 3059 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.596371885295542e-06, |
|
"loss": 2.4852, |
|
"loss_": 0.9128, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3981, |
|
"step": 3066 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.55404925217127e-06, |
|
"loss": 2.5004, |
|
"loss_": 1.0571, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3983, |
|
"step": 3073 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.511773167949885e-06, |
|
"loss": 2.4582, |
|
"loss_": 1.0777, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3986, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 7.46954443718779e-06, |
|
"loss": 2.4644, |
|
"loss_": 0.89, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3984, |
|
"step": 3087 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.427363863540202e-06, |
|
"loss": 2.4668, |
|
"loss_": 1.0102, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3986, |
|
"step": 3094 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.385232249745873e-06, |
|
"loss": 2.4733, |
|
"loss_": 0.6698, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3988, |
|
"step": 3101 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.343150397611782e-06, |
|
"loss": 2.5122, |
|
"loss_": 1.2655, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3979, |
|
"step": 3108 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.301119107997905e-06, |
|
"loss": 2.461, |
|
"loss_": 1.1851, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3978, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.2591391808019555e-06, |
|
"loss": 2.4727, |
|
"loss_": 0.8541, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3979, |
|
"step": 3122 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.217211414944171e-06, |
|
"loss": 2.4443, |
|
"loss_": 1.0654, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3978, |
|
"step": 3129 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.175336608352113e-06, |
|
"loss": 2.4922, |
|
"loss_": 1.184, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3982, |
|
"step": 3136 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 7.133515557945463e-06, |
|
"loss": 2.4643, |
|
"loss_": 1.1851, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3981, |
|
"step": 3143 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 7.091749059620881e-06, |
|
"loss": 2.4581, |
|
"loss_": 1.328, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3982, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 7.0500379082368305e-06, |
|
"loss": 2.4708, |
|
"loss_": 1.0728, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.398, |
|
"step": 3157 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 7.008382897598477e-06, |
|
"loss": 2.4901, |
|
"loss_": 0.9315, |
|
"moe_loss": 0.1606, |
|
"moe_loss_longrong": 1.4088, |
|
"step": 3164 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 6.9667848204425785e-06, |
|
"loss": 2.4706, |
|
"loss_": 1.1113, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3979, |
|
"step": 3171 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 6.9252444684223765e-06, |
|
"loss": 2.4442, |
|
"loss_": 0.7937, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3977, |
|
"step": 3178 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 6.88376263209255e-06, |
|
"loss": 2.4406, |
|
"loss_": 1.1277, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3982, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 6.84234010089417e-06, |
|
"loss": 2.4761, |
|
"loss_": 0.9565, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3982, |
|
"step": 3192 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.800977663139666e-06, |
|
"loss": 2.4832, |
|
"loss_": 0.7461, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4073, |
|
"step": 3199 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.759676105997834e-06, |
|
"loss": 2.4752, |
|
"loss_": 1.1396, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3978, |
|
"step": 3206 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.718436215478849e-06, |
|
"loss": 2.4594, |
|
"loss_": 1.1075, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3978, |
|
"step": 3213 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.677258776419304e-06, |
|
"loss": 2.4703, |
|
"loss_": 0.9133, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3975, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.63614457246728e-06, |
|
"loss": 2.4534, |
|
"loss_": 0.9049, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3978, |
|
"step": 3227 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.595094386067428e-06, |
|
"loss": 2.4945, |
|
"loss_": 1.1701, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3975, |
|
"step": 3234 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.554108998446096e-06, |
|
"loss": 2.4832, |
|
"loss_": 0.9606, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3977, |
|
"step": 3241 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 6.513189189596422e-06, |
|
"loss": 2.4639, |
|
"loss_": 1.0635, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3978, |
|
"step": 3248 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.472335738263534e-06, |
|
"loss": 2.4609, |
|
"loss_": 0.9759, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3982, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.431549421929694e-06, |
|
"loss": 2.4641, |
|
"loss_": 0.7825, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.398, |
|
"step": 3262 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.390831016799527e-06, |
|
"loss": 2.458, |
|
"loss_": 0.8033, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3977, |
|
"step": 3269 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.350181297785242e-06, |
|
"loss": 2.4584, |
|
"loss_": 1.0825, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3974, |
|
"step": 3276 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.309601038491874e-06, |
|
"loss": 2.4911, |
|
"loss_": 0.7566, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3979, |
|
"step": 3283 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.269091011202576e-06, |
|
"loss": 2.457, |
|
"loss_": 0.6181, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3977, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 6.2286519868639095e-06, |
|
"loss": 2.4458, |
|
"loss_": 1.0953, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3975, |
|
"step": 3297 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.188284735071177e-06, |
|
"loss": 2.4848, |
|
"loss_": 0.9802, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3976, |
|
"step": 3304 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.1479900240537956e-06, |
|
"loss": 2.4815, |
|
"loss_": 1.2048, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3977, |
|
"step": 3311 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.107768620660633e-06, |
|
"loss": 2.4552, |
|
"loss_": 1.039, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3976, |
|
"step": 3318 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.067621290345455e-06, |
|
"loss": 2.4365, |
|
"loss_": 1.2129, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3974, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 6.027548797152336e-06, |
|
"loss": 2.4546, |
|
"loss_": 0.6566, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3973, |
|
"step": 3332 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 5.987551903701128e-06, |
|
"loss": 2.4409, |
|
"loss_": 1.0142, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.408, |
|
"step": 3339 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 5.947631371172943e-06, |
|
"loss": 2.4488, |
|
"loss_": 0.9704, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.408, |
|
"step": 3346 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 5.9077879592956675e-06, |
|
"loss": 2.4569, |
|
"loss_": 0.936, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3972, |
|
"step": 3353 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 5.8680224263295045e-06, |
|
"loss": 2.4519, |
|
"loss_": 0.9728, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3972, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 5.828335529052541e-06, |
|
"loss": 2.4757, |
|
"loss_": 0.9242, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3972, |
|
"step": 3367 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 5.788728022746348e-06, |
|
"loss": 2.4769, |
|
"loss_": 0.8005, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3976, |
|
"step": 3374 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 5.749200661181611e-06, |
|
"loss": 2.4548, |
|
"loss_": 1.116, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3971, |
|
"step": 3381 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 5.709754196603781e-06, |
|
"loss": 2.4687, |
|
"loss_": 0.8613, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3971, |
|
"step": 3388 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 5.67038937971875e-06, |
|
"loss": 2.437, |
|
"loss_": 0.9275, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3974, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 5.631106959678575e-06, |
|
"loss": 2.4636, |
|
"loss_": 1.1476, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3974, |
|
"step": 3402 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.5919076840672215e-06, |
|
"loss": 2.449, |
|
"loss_": 0.9428, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.397, |
|
"step": 3409 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.552792298886335e-06, |
|
"loss": 2.4572, |
|
"loss_": 0.8202, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3973, |
|
"step": 3416 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.513761548541032e-06, |
|
"loss": 2.444, |
|
"loss_": 0.905, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3973, |
|
"step": 3423 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.474816175825754e-06, |
|
"loss": 2.4189, |
|
"loss_": 1.1022, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3973, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.4359569219101115e-06, |
|
"loss": 2.5038, |
|
"loss_": 1.1099, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3969, |
|
"step": 3437 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.397184526324792e-06, |
|
"loss": 2.4885, |
|
"loss_": 0.9227, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3973, |
|
"step": 3444 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 5.358499726947488e-06, |
|
"loss": 2.4389, |
|
"loss_": 0.9602, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3971, |
|
"step": 3451 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.31990325998883e-06, |
|
"loss": 2.4275, |
|
"loss_": 0.9191, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3969, |
|
"step": 3458 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.281395859978414e-06, |
|
"loss": 2.4647, |
|
"loss_": 1.0229, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.397, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.24297825975079e-06, |
|
"loss": 2.4649, |
|
"loss_": 0.9973, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3973, |
|
"step": 3472 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.2046511904315265e-06, |
|
"loss": 2.4409, |
|
"loss_": 0.6513, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3969, |
|
"step": 3479 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.166415381423306e-06, |
|
"loss": 2.4805, |
|
"loss_": 1.1712, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3971, |
|
"step": 3486 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.128271560392037e-06, |
|
"loss": 2.4496, |
|
"loss_": 1.0721, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3971, |
|
"step": 3493 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.09022045325299e-06, |
|
"loss": 2.473, |
|
"loss_": 1.1122, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3971, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 5.052262784157014e-06, |
|
"loss": 2.4654, |
|
"loss_": 1.0388, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.397, |
|
"step": 3507 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 5.014399275476721e-06, |
|
"loss": 2.463, |
|
"loss_": 1.0244, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3968, |
|
"step": 3514 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 4.976630647792771e-06, |
|
"loss": 2.4481, |
|
"loss_": 0.7509, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3972, |
|
"step": 3521 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 4.938957619880138e-06, |
|
"loss": 2.4624, |
|
"loss_": 1.0897, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.397, |
|
"step": 3528 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 4.901380908694434e-06, |
|
"loss": 2.4236, |
|
"loss_": 1.1599, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.397, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 4.863901229358261e-06, |
|
"loss": 2.4483, |
|
"loss_": 0.8951, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3971, |
|
"step": 3542 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 4.8265192951476206e-06, |
|
"loss": 2.4552, |
|
"loss_": 0.9006, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3969, |
|
"step": 3549 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 4.789235817478322e-06, |
|
"loss": 2.457, |
|
"loss_": 1.0357, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3969, |
|
"step": 3556 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.752051505892438e-06, |
|
"loss": 2.462, |
|
"loss_": 1.031, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3968, |
|
"step": 3563 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.714967068044826e-06, |
|
"loss": 2.459, |
|
"loss_": 1.2418, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3969, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.677983209689631e-06, |
|
"loss": 2.4449, |
|
"loss_": 0.7941, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3973, |
|
"step": 3577 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.641100634666877e-06, |
|
"loss": 2.4528, |
|
"loss_": 0.7962, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.397, |
|
"step": 3584 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.6043200448890724e-06, |
|
"loss": 2.4674, |
|
"loss_": 1.0349, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3967, |
|
"step": 3591 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.567642140327823e-06, |
|
"loss": 2.4498, |
|
"loss_": 0.9343, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3966, |
|
"step": 3598 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.531067619000553e-06, |
|
"loss": 2.4711, |
|
"loss_": 0.7285, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3967, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.494597176957186e-06, |
|
"loss": 2.4578, |
|
"loss_": 0.6286, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3968, |
|
"step": 3612 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.458231508266912e-06, |
|
"loss": 2.4736, |
|
"loss_": 0.8458, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4062, |
|
"step": 3619 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.421971305004989e-06, |
|
"loss": 2.4841, |
|
"loss_": 0.7491, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 3626 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.385817257239556e-06, |
|
"loss": 2.4332, |
|
"loss_": 0.9237, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4068, |
|
"step": 3633 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.349770053018502e-06, |
|
"loss": 2.4673, |
|
"loss_": 0.9196, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3968, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.313830378356384e-06, |
|
"loss": 2.4438, |
|
"loss_": 1.0917, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.397, |
|
"step": 3647 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.277998917221354e-06, |
|
"loss": 2.4672, |
|
"loss_": 0.8497, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4064, |
|
"step": 3654 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.242276351522161e-06, |
|
"loss": 2.4468, |
|
"loss_": 0.8331, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3969, |
|
"step": 3661 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.206663361095164e-06, |
|
"loss": 2.4639, |
|
"loss_": 0.9817, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 3668 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.171160623691384e-06, |
|
"loss": 2.4403, |
|
"loss_": 1.0819, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3967, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.135768814963622e-06, |
|
"loss": 2.4281, |
|
"loss_": 0.9681, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3966, |
|
"step": 3682 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.100488608453599e-06, |
|
"loss": 2.4383, |
|
"loss_": 0.8748, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3966, |
|
"step": 3689 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.065320675579132e-06, |
|
"loss": 2.4811, |
|
"loss_": 1.2776, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3966, |
|
"step": 3696 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.03026568562135e-06, |
|
"loss": 2.4559, |
|
"loss_": 0.8804, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3967, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 3.995324305711976e-06, |
|
"loss": 2.4263, |
|
"loss_": 0.9593, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3968, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.9604972008206085e-06, |
|
"loss": 2.4698, |
|
"loss_": 1.2848, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3967, |
|
"step": 3717 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.9257850337420856e-06, |
|
"loss": 2.4923, |
|
"loss_": 1.0082, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 3724 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.891188465083865e-06, |
|
"loss": 2.4502, |
|
"loss_": 1.0253, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4059, |
|
"step": 3731 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.8567081532534374e-06, |
|
"loss": 2.4543, |
|
"loss_": 0.6744, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 3738 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.822344754445826e-06, |
|
"loss": 2.4628, |
|
"loss_": 1.0211, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3968, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.788098922631067e-06, |
|
"loss": 2.4765, |
|
"loss_": 1.0228, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3968, |
|
"step": 3752 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.753971309541784e-06, |
|
"loss": 2.4717, |
|
"loss_": 0.846, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3967, |
|
"step": 3759 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 3.719962564660783e-06, |
|
"loss": 2.4447, |
|
"loss_": 0.4578, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3973, |
|
"step": 3766 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.6860733352086866e-06, |
|
"loss": 2.4563, |
|
"loss_": 0.8938, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 3773 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.652304266131612e-06, |
|
"loss": 2.4641, |
|
"loss_": 0.9597, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3966, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.618656000088916e-06, |
|
"loss": 2.4801, |
|
"loss_": 0.7477, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3968, |
|
"step": 3787 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.585129177440938e-06, |
|
"loss": 2.4649, |
|
"loss_": 1.1009, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 3794 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.5517244362368363e-06, |
|
"loss": 2.4828, |
|
"loss_": 1.1634, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3966, |
|
"step": 3801 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.5184424122024406e-06, |
|
"loss": 2.4532, |
|
"loss_": 1.1849, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 3808 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 3.485283738728139e-06, |
|
"loss": 2.4494, |
|
"loss_": 0.8625, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.452249046856836e-06, |
|
"loss": 2.4715, |
|
"loss_": 1.012, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4064, |
|
"step": 3822 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.4193389652719478e-06, |
|
"loss": 2.4256, |
|
"loss_": 1.0154, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 3829 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.3865541202854314e-06, |
|
"loss": 2.4636, |
|
"loss_": 1.1752, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3964, |
|
"step": 3836 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.353895135825854e-06, |
|
"loss": 2.442, |
|
"loss_": 0.8945, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3964, |
|
"step": 3843 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.321362633426547e-06, |
|
"loss": 2.4677, |
|
"loss_": 0.9853, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.2889572322137454e-06, |
|
"loss": 2.4633, |
|
"loss_": 1.2634, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 3857 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.256679548894831e-06, |
|
"loss": 2.4637, |
|
"loss_": 0.8044, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 3864 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 3.224530197746587e-06, |
|
"loss": 2.4343, |
|
"loss_": 0.8018, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3968, |
|
"step": 3871 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.1925097906034962e-06, |
|
"loss": 2.4328, |
|
"loss_": 0.8425, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4055, |
|
"step": 3878 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.1606189368461117e-06, |
|
"loss": 2.4648, |
|
"loss_": 0.9644, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3964, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.128858243389461e-06, |
|
"loss": 2.4541, |
|
"loss_": 0.6231, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3966, |
|
"step": 3892 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.097228314671481e-06, |
|
"loss": 2.476, |
|
"loss_": 0.9949, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 3899 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.065729752641532e-06, |
|
"loss": 2.4229, |
|
"loss_": 0.8875, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 3906 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.034363156748933e-06, |
|
"loss": 2.4502, |
|
"loss_": 0.9087, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4054, |
|
"step": 3913 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 3.0031291239315473e-06, |
|
"loss": 2.4367, |
|
"loss_": 0.8938, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2.9720282486044407e-06, |
|
"loss": 2.471, |
|
"loss_": 0.742, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 3927 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2.941061122648545e-06, |
|
"loss": 2.4598, |
|
"loss_": 0.6142, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3966, |
|
"step": 3934 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2.910228335399419e-06, |
|
"loss": 2.4532, |
|
"loss_": 0.9248, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 3941 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2.8795304736360184e-06, |
|
"loss": 2.4694, |
|
"loss_": 0.876, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 3948 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2.8489681215695242e-06, |
|
"loss": 2.4464, |
|
"loss_": 1.0146, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2.8185418608322344e-06, |
|
"loss": 2.4632, |
|
"loss_": 0.7415, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3964, |
|
"step": 3962 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2.788252270466497e-06, |
|
"loss": 2.4575, |
|
"loss_": 1.1931, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 3969 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2.7580999269136854e-06, |
|
"loss": 2.4825, |
|
"loss_": 0.9967, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 3976 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.728085404003217e-06, |
|
"loss": 2.4658, |
|
"loss_": 0.9402, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 3983 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.698209272941659e-06, |
|
"loss": 2.4466, |
|
"loss_": 1.1097, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.668472102301829e-06, |
|
"loss": 2.4544, |
|
"loss_": 1.018, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 3997 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.6388744580119975e-06, |
|
"loss": 2.4195, |
|
"loss_": 0.9804, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 4004 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.6094169033451066e-06, |
|
"loss": 2.4628, |
|
"loss_": 0.7708, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 4011 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.580099998908049e-06, |
|
"loss": 2.4624, |
|
"loss_": 0.6729, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 4018 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2.5509243026309983e-06, |
|
"loss": 2.4753, |
|
"loss_": 1.177, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.5218903697568075e-06, |
|
"loss": 2.4669, |
|
"loss_": 1.1103, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4032 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.4929987528304144e-06, |
|
"loss": 2.4671, |
|
"loss_": 1.3009, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 4039 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.4642500016883532e-06, |
|
"loss": 2.4649, |
|
"loss_": 0.4641, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3964, |
|
"step": 4046 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.4356446634482756e-06, |
|
"loss": 2.4255, |
|
"loss_": 0.7561, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 4053 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.407183282498534e-06, |
|
"loss": 2.4512, |
|
"loss_": 1.0891, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.3788664004878405e-06, |
|
"loss": 2.4548, |
|
"loss_": 0.8427, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3964, |
|
"step": 4067 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2.350694556314934e-06, |
|
"loss": 2.4775, |
|
"loss_": 1.1603, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4074 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.32266828611835e-06, |
|
"loss": 2.4762, |
|
"loss_": 0.982, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4081 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.2947881232662007e-06, |
|
"loss": 2.4574, |
|
"loss_": 0.6854, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 4088 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.2670545983460245e-06, |
|
"loss": 2.4641, |
|
"loss_": 1.1094, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.2394682391546928e-06, |
|
"loss": 2.4546, |
|
"loss_": 0.8832, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 4102 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.2120295706883698e-06, |
|
"loss": 2.4228, |
|
"loss_": 0.534, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3966, |
|
"step": 4109 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.184739115132517e-06, |
|
"loss": 2.4502, |
|
"loss_": 0.6129, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4116 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.157597391851949e-06, |
|
"loss": 2.4514, |
|
"loss_": 0.8927, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4045, |
|
"step": 4123 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2.130604917380962e-06, |
|
"loss": 2.4434, |
|
"loss_": 0.87, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 2.103762205413493e-06, |
|
"loss": 2.4475, |
|
"loss_": 1.1291, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 4137 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 2.0770697667933436e-06, |
|
"loss": 2.4697, |
|
"loss_": 0.8274, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 4144 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 2.0505281095044804e-06, |
|
"loss": 2.4725, |
|
"loss_": 1.0877, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4151 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 2.024137738661329e-06, |
|
"loss": 2.4757, |
|
"loss_": 0.6894, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 4158 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 1.997899156499191e-06, |
|
"loss": 2.4566, |
|
"loss_": 0.7625, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 1.9718128623646792e-06, |
|
"loss": 2.514, |
|
"loss_": 1.3132, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4172 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 1.9458793527062035e-06, |
|
"loss": 2.4659, |
|
"loss_": 0.6083, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 4179 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.9200991210645394e-06, |
|
"loss": 2.441, |
|
"loss_": 0.8687, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3964, |
|
"step": 4186 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.8944726580634287e-06, |
|
"loss": 2.4227, |
|
"loss_": 0.863, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4043, |
|
"step": 4193 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.8690004514002314e-06, |
|
"loss": 2.4488, |
|
"loss_": 1.0513, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.8436829858366655e-06, |
|
"loss": 2.4269, |
|
"loss_": 0.9285, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4207 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.8185207431895613e-06, |
|
"loss": 2.4577, |
|
"loss_": 1.0791, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4214 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.7935142023217056e-06, |
|
"loss": 2.4565, |
|
"loss_": 1.0052, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4221 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.768663839132727e-06, |
|
"loss": 2.4314, |
|
"loss_": 0.7553, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4228 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.7439701265500274e-06, |
|
"loss": 2.432, |
|
"loss_": 1.137, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.7194335345197933e-06, |
|
"loss": 2.466, |
|
"loss_": 0.804, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 4242 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.6950545299980526e-06, |
|
"loss": 2.4119, |
|
"loss_": 0.8362, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 4249 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.6708335769417827e-06, |
|
"loss": 2.4555, |
|
"loss_": 0.8946, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4256 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.6467711363000794e-06, |
|
"loss": 2.434, |
|
"loss_": 1.1374, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4263 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.6228676660053932e-06, |
|
"loss": 2.4705, |
|
"loss_": 1.07, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.5991236209648052e-06, |
|
"loss": 2.4467, |
|
"loss_": 0.5343, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 4277 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 1.575539453051369e-06, |
|
"loss": 2.4617, |
|
"loss_": 1.2505, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4284 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.5521156110955293e-06, |
|
"loss": 2.4389, |
|
"loss_": 1.0836, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4291 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.5288525408765564e-06, |
|
"loss": 2.4877, |
|
"loss_": 0.8473, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4298 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.5057506851140701e-06, |
|
"loss": 2.4786, |
|
"loss_": 1.0259, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.4828104834596268e-06, |
|
"loss": 2.4086, |
|
"loss_": 0.5643, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3965, |
|
"step": 4312 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.4600323724883337e-06, |
|
"loss": 2.4481, |
|
"loss_": 1.0485, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4319 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.4374167856905542e-06, |
|
"loss": 2.4386, |
|
"loss_": 0.9296, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4326 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.414964153463655e-06, |
|
"loss": 2.4538, |
|
"loss_": 0.7446, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4333 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1.3926749031038055e-06, |
|
"loss": 2.4252, |
|
"loss_": 1.0624, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.370549458797863e-06, |
|
"loss": 2.4477, |
|
"loss_": 1.0075, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4347 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.3485882416152819e-06, |
|
"loss": 2.4224, |
|
"loss_": 0.9794, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4354 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.3267916695001172e-06, |
|
"loss": 2.4571, |
|
"loss_": 0.9473, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3954, |
|
"step": 4361 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.3051601572630611e-06, |
|
"loss": 2.449, |
|
"loss_": 1.1259, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4368 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.283694116573546e-06, |
|
"loss": 2.4477, |
|
"loss_": 1.0313, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.2623939559519161e-06, |
|
"loss": 2.46, |
|
"loss_": 0.888, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4382 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.2412600807616526e-06, |
|
"loss": 2.4559, |
|
"loss_": 0.9206, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4389 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.2202928932016588e-06, |
|
"loss": 2.4259, |
|
"loss_": 0.8758, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4396 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.1994927922985999e-06, |
|
"loss": 2.4477, |
|
"loss_": 0.8513, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 4403 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.178860173899321e-06, |
|
"loss": 2.4408, |
|
"loss_": 1.0152, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.1583954306633004e-06, |
|
"loss": 2.4442, |
|
"loss_": 1.1666, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4417 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.138098952055181e-06, |
|
"loss": 2.4404, |
|
"loss_": 0.6781, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4424 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.1179711243373736e-06, |
|
"loss": 2.4439, |
|
"loss_": 0.8599, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 4431 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1.0980123305626812e-06, |
|
"loss": 2.4635, |
|
"loss_": 1.0466, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4055, |
|
"step": 4438 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 1.0782229505670195e-06, |
|
"loss": 2.4436, |
|
"loss_": 1.1018, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 1.0586033609622004e-06, |
|
"loss": 2.4521, |
|
"loss_": 0.9168, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3954, |
|
"step": 4452 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 1.039153935128744e-06, |
|
"loss": 2.4435, |
|
"loss_": 1.1978, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4459 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 1.0198750432087855e-06, |
|
"loss": 2.4683, |
|
"loss_": 1.2032, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4466 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 1.0007670520990331e-06, |
|
"loss": 2.4688, |
|
"loss_": 1.0949, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4473 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 9.818303254437723e-07, |
|
"loss": 2.459, |
|
"loss_": 1.3033, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 9.630652236279626e-07, |
|
"loss": 2.4758, |
|
"loss_": 0.8537, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4487 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 9.444721037703597e-07, |
|
"loss": 2.4579, |
|
"loss_": 0.6901, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4494 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 9.260513197167398e-07, |
|
"loss": 2.4456, |
|
"loss_": 1.0315, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4501 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 9.078032220331523e-07, |
|
"loss": 2.4451, |
|
"loss_": 1.1487, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4508 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 8.897281579992467e-07, |
|
"loss": 2.4204, |
|
"loss_": 1.1135, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 8.718264716016722e-07, |
|
"loss": 2.4218, |
|
"loss_": 0.8637, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4522 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 8.540985035275273e-07, |
|
"loss": 2.447, |
|
"loss_": 0.8818, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4529 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 8.365445911578785e-07, |
|
"loss": 2.4654, |
|
"loss_": 0.9987, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4536 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 8.191650685613273e-07, |
|
"loss": 2.4603, |
|
"loss_": 1.1883, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4543 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 8.019602664876758e-07, |
|
"loss": 2.4475, |
|
"loss_": 1.0646, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 7.849305123616091e-07, |
|
"loss": 2.4486, |
|
"loss_": 0.8589, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4557 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 7.680761302764727e-07, |
|
"loss": 2.4336, |
|
"loss_": 1.0525, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4564 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 7.513974409881186e-07, |
|
"loss": 2.4663, |
|
"loss_": 1.0924, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4571 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 7.348947619087754e-07, |
|
"loss": 2.4417, |
|
"loss_": 1.0197, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4578 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 7.185684071010224e-07, |
|
"loss": 2.4364, |
|
"loss_": 0.9028, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 7.024186872718164e-07, |
|
"loss": 2.4733, |
|
"loss_": 0.5258, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 4592 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 6.864459097665654e-07, |
|
"loss": 2.4453, |
|
"loss_": 0.9338, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4599 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 6.706503785632934e-07, |
|
"loss": 2.432, |
|
"loss_": 1.1129, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4606 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 6.550323942668469e-07, |
|
"loss": 2.4297, |
|
"loss_": 0.6761, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3963, |
|
"step": 4613 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 6.395922541031741e-07, |
|
"loss": 2.4152, |
|
"loss_": 0.8792, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 6.24330251913674e-07, |
|
"loss": 2.4526, |
|
"loss_": 1.1836, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4627 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 6.092466781495976e-07, |
|
"loss": 2.4362, |
|
"loss_": 0.9499, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4634 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 5.943418198665251e-07, |
|
"loss": 2.4439, |
|
"loss_": 1.1622, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4641 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 5.796159607189001e-07, |
|
"loss": 2.4273, |
|
"loss_": 0.9876, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4648 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 5.650693809546348e-07, |
|
"loss": 2.4735, |
|
"loss_": 1.1284, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 5.507023574097725e-07, |
|
"loss": 2.4393, |
|
"loss_": 0.8675, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4662 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 5.365151635032218e-07, |
|
"loss": 2.4482, |
|
"loss_": 0.901, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4669 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 5.225080692315532e-07, |
|
"loss": 2.441, |
|
"loss_": 1.0355, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4676 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 5.086813411638581e-07, |
|
"loss": 2.4277, |
|
"loss_": 0.9478, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4683 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 4.9503524243668e-07, |
|
"loss": 2.444, |
|
"loss_": 0.8901, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4048, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 4.815700327490014e-07, |
|
"loss": 2.4286, |
|
"loss_": 0.8906, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4697 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.6828596835730487e-07, |
|
"loss": 2.4475, |
|
"loss_": 1.028, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4704 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.551833020707008e-07, |
|
"loss": 2.4281, |
|
"loss_": 0.6545, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4711 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.4226228324610544e-07, |
|
"loss": 2.4677, |
|
"loss_": 0.9228, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3954, |
|
"step": 4718 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.295231577835024e-07, |
|
"loss": 2.443, |
|
"loss_": 0.8677, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.1696616812126333e-07, |
|
"loss": 2.4452, |
|
"loss_": 0.8619, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4732 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 4.0459155323153034e-07, |
|
"loss": 2.4501, |
|
"loss_": 0.5721, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4739 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 3.9239954861567177e-07, |
|
"loss": 2.4452, |
|
"loss_": 1.2849, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4746 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 3.803903862998004e-07, |
|
"loss": 2.4681, |
|
"loss_": 1.0272, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4052, |
|
"step": 4753 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.685642948303503e-07, |
|
"loss": 2.4437, |
|
"loss_": 0.929, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.5692149926974006e-07, |
|
"loss": 2.4502, |
|
"loss_": 1.1455, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4767 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.454622211920766e-07, |
|
"loss": 2.4262, |
|
"loss_": 0.6494, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4774 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.341866786789505e-07, |
|
"loss": 2.4259, |
|
"loss_": 1.113, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4781 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.2309508631527486e-07, |
|
"loss": 2.4309, |
|
"loss_": 0.8977, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4788 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.121876551852099e-07, |
|
"loss": 2.4311, |
|
"loss_": 1.0739, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4795 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 3.0146459286813924e-07, |
|
"loss": 2.4515, |
|
"loss_": 0.9781, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4802 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.909261034347255e-07, |
|
"loss": 2.4553, |
|
"loss_": 0.9123, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 4809 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.8057238744301994e-07, |
|
"loss": 2.4516, |
|
"loss_": 1.131, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4816 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.704036419346534e-07, |
|
"loss": 2.4628, |
|
"loss_": 1.0138, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4051, |
|
"step": 4823 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.604200604310825e-07, |
|
"loss": 2.4657, |
|
"loss_": 1.0133, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.506218329299026e-07, |
|
"loss": 2.4311, |
|
"loss_": 1.1114, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4837 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.410091459012376e-07, |
|
"loss": 2.4529, |
|
"loss_": 1.0407, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 4844 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.3158218228419127e-07, |
|
"loss": 2.4564, |
|
"loss_": 1.061, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4851 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2.2234112148336373e-07, |
|
"loss": 2.4584, |
|
"loss_": 0.735, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4858 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 2.1328613936543396e-07, |
|
"loss": 2.425, |
|
"loss_": 0.9476, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3962, |
|
"step": 4865 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 2.0441740825582258e-07, |
|
"loss": 2.4643, |
|
"loss_": 1.0806, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4872 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 1.9573509693540104e-07, |
|
"loss": 2.4676, |
|
"loss_": 0.825, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3954, |
|
"step": 4879 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 1.872393706372866e-07, |
|
"loss": 2.4485, |
|
"loss_": 0.8625, |
|
"moe_loss": 0.1605, |
|
"moe_loss_longrong": 1.4046, |
|
"step": 4886 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 1.789303910436968e-07, |
|
"loss": 2.4413, |
|
"loss_": 0.7278, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4893 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 1.7080831628286886e-07, |
|
"loss": 2.4452, |
|
"loss_": 0.812, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 1.6287330092605525e-07, |
|
"loss": 2.4596, |
|
"loss_": 0.6445, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 4907 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.551254959845805e-07, |
|
"loss": 2.4541, |
|
"loss_": 1.0014, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4914 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.4756504890696466e-07, |
|
"loss": 2.4342, |
|
"loss_": 1.1963, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4921 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.401921035761189e-07, |
|
"loss": 2.4346, |
|
"loss_": 0.8071, |
|
"moe_loss": 0.1604, |
|
"moe_loss_longrong": 1.4051, |
|
"step": 4928 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.3300680030661096e-07, |
|
"loss": 2.464, |
|
"loss_": 1.1756, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.2600927584198618e-07, |
|
"loss": 2.4334, |
|
"loss_": 1.0679, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3954, |
|
"step": 4942 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.1919966335217636e-07, |
|
"loss": 2.4779, |
|
"loss_": 0.8872, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4949 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.1257809243095385e-07, |
|
"loss": 2.4339, |
|
"loss_": 0.8956, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4956 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1.0614468909347476e-07, |
|
"loss": 2.4414, |
|
"loss_": 1.1397, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4963 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 9.989957577387521e-08, |
|
"loss": 2.4253, |
|
"loss_": 0.8755, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 9.384287132294223e-08, |
|
"loss": 2.4599, |
|
"loss_": 0.9577, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3954, |
|
"step": 4977 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 8.797469100585432e-08, |
|
"loss": 2.4615, |
|
"loss_": 0.7768, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4984 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 8.229514649998438e-08, |
|
"loss": 2.4414, |
|
"loss_": 1.0058, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 4991 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 7.680434589277696e-08, |
|
"loss": 2.4587, |
|
"loss_": 1.0013, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 4998 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 7.150239367969102e-08, |
|
"loss": 2.4314, |
|
"loss_": 0.9539, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5005 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 6.63893907622104e-08, |
|
"loss": 2.4711, |
|
"loss_": 0.9778, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3954, |
|
"step": 5012 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 6.14654344459209e-08, |
|
"loss": 2.4558, |
|
"loss_": 0.9942, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 5019 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 5.673061843866623e-08, |
|
"loss": 2.4748, |
|
"loss_": 0.7836, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5026 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 5.218503284875609e-08, |
|
"loss": 2.4418, |
|
"loss_": 1.0962, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 5033 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 4.7828764183257545e-08, |
|
"loss": 2.4128, |
|
"loss_": 0.9561, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 4.366189534634191e-08, |
|
"loss": 2.4604, |
|
"loss_": 0.9591, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 5047 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 3.9684505637718194e-08, |
|
"loss": 2.4619, |
|
"loss_": 1.1709, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5054 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 3.589667075110992e-08, |
|
"loss": 2.4199, |
|
"loss_": 1.0163, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5061 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 3.229846277282511e-08, |
|
"loss": 2.4621, |
|
"loss_": 0.9223, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5068 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 2.8889950180382985e-08, |
|
"loss": 2.4625, |
|
"loss_": 0.4562, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 2.5671197841203867e-08, |
|
"loss": 2.4386, |
|
"loss_": 1.0035, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3958, |
|
"step": 5082 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 2.264226701138461e-08, |
|
"loss": 2.4681, |
|
"loss_": 1.218, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3955, |
|
"step": 5089 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 1.9803215334522895e-08, |
|
"loss": 2.427, |
|
"loss_": 0.7696, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3961, |
|
"step": 5096 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 1.7154096840629186e-08, |
|
"loss": 2.4851, |
|
"loss_": 1.054, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 5103 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 1.4694961945093122e-08, |
|
"loss": 2.4448, |
|
"loss_": 0.8593, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 1.2425857447725397e-08, |
|
"loss": 2.454, |
|
"loss_": 0.8875, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 5117 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 1.0346826531865139e-08, |
|
"loss": 2.4289, |
|
"loss_": 0.9578, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 5124 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 8.457908763562783e-09, |
|
"loss": 2.4436, |
|
"loss_": 1.026, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 5131 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 6.759140090824012e-09, |
|
"loss": 2.4605, |
|
"loss_": 0.6569, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3954, |
|
"step": 5138 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 5.25055284292475e-09, |
|
"loss": 2.4668, |
|
"loss_": 1.2459, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5145 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 3.932175729797205e-09, |
|
"loss": 2.4129, |
|
"loss_": 0.9565, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5152 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 2.8040338414847545e-09, |
|
"loss": 2.4469, |
|
"loss_": 0.8875, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3957, |
|
"step": 5159 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 1.8661486476612144e-09, |
|
"loss": 2.4186, |
|
"loss_": 0.7829, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.396, |
|
"step": 5166 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 1.1185379972256105e-09, |
|
"loss": 2.4351, |
|
"loss_": 0.7496, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 5173 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 5.612161179613385e-10, |
|
"loss": 2.4242, |
|
"loss_": 0.9366, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 1.9419361626416e-10, |
|
"loss": 2.4684, |
|
"loss_": 1.0884, |
|
"moe_loss": 0.16, |
|
"moe_loss_longrong": 1.3956, |
|
"step": 5187 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 1.7477476940142013e-11, |
|
"loss": 2.4801, |
|
"loss_": 1.0033, |
|
"moe_loss": 0.1601, |
|
"moe_loss_longrong": 1.3959, |
|
"step": 5194 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 5197, |
|
"total_flos": 8.818772994970092e+18, |
|
"train_loss": 2.4979199135512236, |
|
"train_runtime": 95362.7285, |
|
"train_samples_per_second": 6.976, |
|
"train_steps_per_second": 0.054 |
|
} |
|
], |
|
"logging_steps": 7, |
|
"max_steps": 5197, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"total_flos": 8.818772994970092e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|