|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 2010, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004975124378109452, |
|
"grad_norm": 11.875, |
|
"learning_rate": 9.950248756218907e-07, |
|
"loss": 4.8907, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0024875621890547263, |
|
"grad_norm": 13.125, |
|
"learning_rate": 4.975124378109453e-06, |
|
"loss": 4.8447, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.004975124378109453, |
|
"grad_norm": 12.5, |
|
"learning_rate": 9.950248756218906e-06, |
|
"loss": 4.9268, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.007462686567164179, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.4925373134328357e-05, |
|
"loss": 4.9396, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.009950248756218905, |
|
"grad_norm": 14.6875, |
|
"learning_rate": 1.990049751243781e-05, |
|
"loss": 5.0521, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012437810945273632, |
|
"grad_norm": 12.625, |
|
"learning_rate": 2.4875621890547266e-05, |
|
"loss": 4.7898, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.014925373134328358, |
|
"grad_norm": 12.875, |
|
"learning_rate": 2.9850746268656714e-05, |
|
"loss": 4.8798, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.017412935323383085, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 3.4825870646766175e-05, |
|
"loss": 4.8204, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.01990049751243781, |
|
"grad_norm": 11.6875, |
|
"learning_rate": 3.980099502487562e-05, |
|
"loss": 4.684, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.022388059701492536, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 4.477611940298508e-05, |
|
"loss": 4.6143, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.024875621890547265, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 4.975124378109453e-05, |
|
"loss": 4.5667, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02736318407960199, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 5.472636815920398e-05, |
|
"loss": 4.6985, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.029850746268656716, |
|
"grad_norm": 13.375, |
|
"learning_rate": 5.970149253731343e-05, |
|
"loss": 4.5517, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03233830845771144, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 6.46766169154229e-05, |
|
"loss": 4.5351, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03482587064676617, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 6.965174129353235e-05, |
|
"loss": 4.454, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03731343283582089, |
|
"grad_norm": 11.125, |
|
"learning_rate": 7.46268656716418e-05, |
|
"loss": 4.4039, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.03980099502487562, |
|
"grad_norm": 10.875, |
|
"learning_rate": 7.960199004975125e-05, |
|
"loss": 4.3716, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04228855721393035, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 8.45771144278607e-05, |
|
"loss": 4.3879, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04477611940298507, |
|
"grad_norm": 10.25, |
|
"learning_rate": 8.955223880597016e-05, |
|
"loss": 4.1858, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0472636815920398, |
|
"grad_norm": 10.5, |
|
"learning_rate": 9.452736318407961e-05, |
|
"loss": 4.421, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.04975124378109453, |
|
"grad_norm": 10.375, |
|
"learning_rate": 9.950248756218906e-05, |
|
"loss": 4.3506, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05223880597014925, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 0.0001044776119402985, |
|
"loss": 4.2671, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.05472636815920398, |
|
"grad_norm": 8.625, |
|
"learning_rate": 0.00010945273631840796, |
|
"loss": 4.296, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05721393034825871, |
|
"grad_norm": 11.75, |
|
"learning_rate": 0.00011442786069651741, |
|
"loss": 4.1273, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.05970149253731343, |
|
"grad_norm": 8.25, |
|
"learning_rate": 0.00011940298507462686, |
|
"loss": 4.3068, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06218905472636816, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 0.0001243781094527363, |
|
"loss": 4.2837, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.06467661691542288, |
|
"grad_norm": 11.75, |
|
"learning_rate": 0.0001293532338308458, |
|
"loss": 4.3174, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06716417910447761, |
|
"grad_norm": 10.0, |
|
"learning_rate": 0.00013432835820895525, |
|
"loss": 4.086, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.06965174129353234, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 0.0001393034825870647, |
|
"loss": 4.4375, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07213930348258707, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 0.00014427860696517416, |
|
"loss": 4.2058, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.07462686567164178, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 0.0001492537313432836, |
|
"loss": 4.1712, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07711442786069651, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 0.00015422885572139304, |
|
"loss": 4.0637, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.07960199004975124, |
|
"grad_norm": 9.0, |
|
"learning_rate": 0.0001592039800995025, |
|
"loss": 4.0861, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08208955223880597, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 0.00016417910447761195, |
|
"loss": 4.1879, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0845771144278607, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 0.0001691542288557214, |
|
"loss": 4.2878, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08706467661691543, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 0.00017412935323383086, |
|
"loss": 4.0535, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.08955223880597014, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 0.0001791044776119403, |
|
"loss": 4.1332, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09203980099502487, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 0.00018407960199004977, |
|
"loss": 4.114, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.0945273631840796, |
|
"grad_norm": 9.25, |
|
"learning_rate": 0.00018905472636815922, |
|
"loss": 4.0242, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09701492537313433, |
|
"grad_norm": 8.875, |
|
"learning_rate": 0.00019402985074626867, |
|
"loss": 4.1829, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.09950248756218906, |
|
"grad_norm": 11.375, |
|
"learning_rate": 0.00019900497512437813, |
|
"loss": 4.2259, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10199004975124377, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 0.000199997587258178, |
|
"loss": 4.302, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.1044776119402985, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 0.0001999877856940653, |
|
"loss": 4.0564, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.10696517412935323, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 0.00019997044524974799, |
|
"loss": 4.0902, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.10945273631840796, |
|
"grad_norm": 8.125, |
|
"learning_rate": 0.00019994556723266103, |
|
"loss": 4.0716, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.11194029850746269, |
|
"grad_norm": 8.0, |
|
"learning_rate": 0.00019991315351855748, |
|
"loss": 4.1908, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.11442786069651742, |
|
"grad_norm": 9.0, |
|
"learning_rate": 0.00019987320655136693, |
|
"loss": 4.2773, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11691542288557213, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 0.00019982572934301122, |
|
"loss": 4.1853, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.11940298507462686, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 0.0001997707254731775, |
|
"loss": 4.1019, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.12189054726368159, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 0.00019970819908904814, |
|
"loss": 4.198, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.12437810945273632, |
|
"grad_norm": 9.125, |
|
"learning_rate": 0.00019963815490498817, |
|
"loss": 4.1697, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12686567164179105, |
|
"grad_norm": 10.625, |
|
"learning_rate": 0.00019956059820218982, |
|
"loss": 4.0784, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.12935323383084577, |
|
"grad_norm": 9.0, |
|
"learning_rate": 0.00019947553482827418, |
|
"loss": 4.1693, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1318407960199005, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 0.00019938297119685054, |
|
"loss": 4.0987, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.13432835820895522, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 0.00019928291428703262, |
|
"loss": 4.1734, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13681592039800994, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 0.00019917537164291244, |
|
"loss": 4.0456, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.13930348258706468, |
|
"grad_norm": 8.25, |
|
"learning_rate": 0.0001990603513729915, |
|
"loss": 4.1025, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1417910447761194, |
|
"grad_norm": 8.625, |
|
"learning_rate": 0.00019893786214956945, |
|
"loss": 4.1861, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.14427860696517414, |
|
"grad_norm": 8.5, |
|
"learning_rate": 0.0001988079132080901, |
|
"loss": 4.1516, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.14676616915422885, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 0.0001986705143464453, |
|
"loss": 4.0148, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.14925373134328357, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 0.0001985256759242359, |
|
"loss": 3.9918, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1517412935323383, |
|
"grad_norm": 7.875, |
|
"learning_rate": 0.00019837340886199096, |
|
"loss": 4.0434, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.15422885572139303, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 0.00019821372464034416, |
|
"loss": 4.1499, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15671641791044777, |
|
"grad_norm": 8.75, |
|
"learning_rate": 0.00019804663529916826, |
|
"loss": 4.0495, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.15920398009950248, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 0.00019787215343666732, |
|
"loss": 4.0981, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.16169154228855723, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 0.00019769029220842677, |
|
"loss": 4.0678, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.16417910447761194, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 0.0001975010653264216, |
|
"loss": 4.1043, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 8.5, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 4.1361, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.1691542288557214, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 0.00019710057222471967, |
|
"loss": 3.9753, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.17164179104477612, |
|
"grad_norm": 7.875, |
|
"learning_rate": 0.00019688933620140637, |
|
"loss": 4.1972, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.17412935323383086, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 0.0001966707949148186, |
|
"loss": 4.0355, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17661691542288557, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 0.00019644496484253474, |
|
"loss": 4.0079, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.1791044776119403, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 0.00019621186301169315, |
|
"loss": 3.9721, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.18159203980099503, |
|
"grad_norm": 8.25, |
|
"learning_rate": 0.00019597150699770835, |
|
"loss": 4.1628, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.18407960199004975, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 0.0001957239149229458, |
|
"loss": 3.9472, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1865671641791045, |
|
"grad_norm": 8.125, |
|
"learning_rate": 0.00019546910545535558, |
|
"loss": 4.2425, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.1890547263681592, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 0.00019520709780706486, |
|
"loss": 4.1314, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.19154228855721392, |
|
"grad_norm": 8.5, |
|
"learning_rate": 0.00019493791173292923, |
|
"loss": 4.0324, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.19402985074626866, |
|
"grad_norm": 8.0, |
|
"learning_rate": 0.00019466156752904343, |
|
"loss": 4.0719, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19651741293532338, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 0.00019437808603121087, |
|
"loss": 3.8774, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.19900497512437812, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 0.00019408748861337273, |
|
"loss": 4.1163, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.20149253731343283, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 0.00019378979718599645, |
|
"loss": 4.1658, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.20398009950248755, |
|
"grad_norm": 7.625, |
|
"learning_rate": 0.0001934850341944237, |
|
"loss": 4.1059, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.2064676616915423, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 0.00019317322261717794, |
|
"loss": 4.0303, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.208955223880597, |
|
"grad_norm": 8.25, |
|
"learning_rate": 0.00019285438596423204, |
|
"loss": 3.9217, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.21144278606965175, |
|
"grad_norm": 7.375, |
|
"learning_rate": 0.00019252854827523557, |
|
"loss": 4.0601, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.21393034825870647, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 0.00019219573411770235, |
|
"loss": 4.025, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.21641791044776118, |
|
"grad_norm": 7.71875, |
|
"learning_rate": 0.000191855968585158, |
|
"loss": 4.0082, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.21890547263681592, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 0.000191509277295248, |
|
"loss": 3.9052, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.22139303482587064, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 0.00019115568638780622, |
|
"loss": 3.8947, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.22388059701492538, |
|
"grad_norm": 7.40625, |
|
"learning_rate": 0.00019079522252288386, |
|
"loss": 3.8908, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2263681592039801, |
|
"grad_norm": 8.5, |
|
"learning_rate": 0.00019042791287873957, |
|
"loss": 4.198, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.22885572139303484, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 0.00019005378514979008, |
|
"loss": 4.06, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.23134328358208955, |
|
"grad_norm": 7.625, |
|
"learning_rate": 0.00018967286754452214, |
|
"loss": 4.1332, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.23383084577114427, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 0.0001892851887833657, |
|
"loss": 4.0782, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.236318407960199, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 0.0001888907780965284, |
|
"loss": 4.0219, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.23880597014925373, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 0.00018848966522179168, |
|
"loss": 4.0916, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.24129353233830847, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 0.00018808188040226868, |
|
"loss": 4.1352, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.24378109452736318, |
|
"grad_norm": 8.25, |
|
"learning_rate": 0.00018766745438412384, |
|
"loss": 4.0799, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2462686567164179, |
|
"grad_norm": 8.375, |
|
"learning_rate": 0.00018724641841425478, |
|
"loss": 4.0501, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.24875621890547264, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 0.00018681880423793642, |
|
"loss": 4.0131, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2512437810945274, |
|
"grad_norm": 8.25, |
|
"learning_rate": 0.00018638464409642723, |
|
"loss": 4.2064, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.2537313432835821, |
|
"grad_norm": 7.0, |
|
"learning_rate": 0.00018594397072453856, |
|
"loss": 4.1475, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2562189054726368, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 0.00018549681734816623, |
|
"loss": 3.9829, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.25870646766169153, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 0.0001850432176817857, |
|
"loss": 4.0752, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.26119402985074625, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 0.00018458320592590975, |
|
"loss": 3.8724, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.263681592039801, |
|
"grad_norm": 7.3125, |
|
"learning_rate": 0.00018411681676450999, |
|
"loss": 4.0854, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.26616915422885573, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 0.0001836440853624017, |
|
"loss": 4.0408, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.26865671641791045, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 0.00018316504736259255, |
|
"loss": 4.0437, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.27114427860696516, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 0.00018267973888359509, |
|
"loss": 4.0593, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.2736318407960199, |
|
"grad_norm": 7.625, |
|
"learning_rate": 0.00018218819651670356, |
|
"loss": 3.9724, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.27611940298507465, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 0.00018169045732323492, |
|
"loss": 4.1018, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.27860696517412936, |
|
"grad_norm": 7.75, |
|
"learning_rate": 0.00018118655883173456, |
|
"loss": 4.1389, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.2810945273631841, |
|
"grad_norm": 7.75, |
|
"learning_rate": 0.0001806765390351467, |
|
"loss": 4.0369, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.2835820895522388, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 0.00018016043638794974, |
|
"loss": 4.1131, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.2860696517412935, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 0.00017963828980325697, |
|
"loss": 3.8789, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.2885572139303483, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 0.00017911013864988252, |
|
"loss": 4.1892, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.291044776119403, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 0.00017857602274937308, |
|
"loss": 4.0332, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.2935323383084577, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 0.00017803598237300537, |
|
"loss": 4.0141, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.2960199004975124, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 0.00017749005823874988, |
|
"loss": 3.9258, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 0.00017693829150820068, |
|
"loss": 4.072, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3009950248756219, |
|
"grad_norm": 7.3125, |
|
"learning_rate": 0.00017638072378347203, |
|
"loss": 3.8492, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.3034825870646766, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 0.0001758173971040616, |
|
"loss": 3.8323, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.30597014925373134, |
|
"grad_norm": 8.25, |
|
"learning_rate": 0.00017524835394368065, |
|
"loss": 3.9926, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.30845771144278605, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 0.00017467363720705204, |
|
"loss": 4.0593, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.31094527363184077, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 0.0001740932902266747, |
|
"loss": 3.8775, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.31343283582089554, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 0.00017350735675955697, |
|
"loss": 4.1344, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.31592039800995025, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 0.000172915880983917, |
|
"loss": 3.948, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.31840796019900497, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 0.0001723189074958521, |
|
"loss": 3.9485, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3208955223880597, |
|
"grad_norm": 7.625, |
|
"learning_rate": 0.00017171648130597612, |
|
"loss": 3.9687, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.32338308457711445, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 0.0001711086478360257, |
|
"loss": 4.0554, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.32587064676616917, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 0.0001704954529154359, |
|
"loss": 4.0395, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.3283582089552239, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 0.00016987694277788417, |
|
"loss": 3.9427, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3308457711442786, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 0.000169253164057805, |
|
"loss": 3.9438, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 4.1186, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.3358208955223881, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 0.00016798998939045895, |
|
"loss": 4.0849, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.3383084577114428, |
|
"grad_norm": 7.375, |
|
"learning_rate": 0.00016735068868404998, |
|
"loss": 3.9868, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3407960199004975, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 0.0001667063098696485, |
|
"loss": 3.9275, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.34328358208955223, |
|
"grad_norm": 8.125, |
|
"learning_rate": 0.0001660569015321357, |
|
"loss": 4.0451, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.34577114427860695, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 0.00016540251263560878, |
|
"loss": 3.9818, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.3482587064676617, |
|
"grad_norm": 8.875, |
|
"learning_rate": 0.00016474319251968923, |
|
"loss": 3.9491, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.35074626865671643, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 0.00016407899089580262, |
|
"loss": 3.9901, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.35323383084577115, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 0.0001634099578434306, |
|
"loss": 3.9471, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.35572139303482586, |
|
"grad_norm": 7.375, |
|
"learning_rate": 0.00016273614380633484, |
|
"loss": 3.897, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.3582089552238806, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 0.0001620575995887538, |
|
"loss": 3.9658, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.36069651741293535, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 0.00016137437635157213, |
|
"loss": 3.9457, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.36318407960199006, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 0.00016068652560846327, |
|
"loss": 4.143, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.3656716417910448, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 0.0001599940992220053, |
|
"loss": 4.0813, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.3681592039800995, |
|
"grad_norm": 7.40625, |
|
"learning_rate": 0.0001592971493997709, |
|
"loss": 4.019, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.3706467661691542, |
|
"grad_norm": 6.75, |
|
"learning_rate": 0.00015859572869039064, |
|
"loss": 4.0779, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.373134328358209, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 0.00015788988997959114, |
|
"loss": 4.1056, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3756218905472637, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 0.00015717968648620764, |
|
"loss": 4.0207, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.3781094527363184, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 0.00015646517175817114, |
|
"loss": 4.2123, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.3805970149253731, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 0.00015574639966847126, |
|
"loss": 4.0826, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.38308457711442784, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 0.00015502342441109422, |
|
"loss": 4.0236, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.3855721393034826, |
|
"grad_norm": 7.625, |
|
"learning_rate": 0.00015429630049693674, |
|
"loss": 3.9291, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.3880597014925373, |
|
"grad_norm": 7.875, |
|
"learning_rate": 0.00015356508274969594, |
|
"loss": 4.0301, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.39054726368159204, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 0.00015282982630173585, |
|
"loss": 3.9478, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.39303482587064675, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 0.00015209058658993056, |
|
"loss": 3.9102, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.39552238805970147, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 0.0001513474193514842, |
|
"loss": 4.0111, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.39800995024875624, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 0.00015060038061972874, |
|
"loss": 3.9447, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.40049751243781095, |
|
"grad_norm": 5.5, |
|
"learning_rate": 0.000149849526719899, |
|
"loss": 3.7303, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.40298507462686567, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 0.00014909491426488578, |
|
"loss": 4.1654, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.4054726368159204, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 0.00014833660015096766, |
|
"loss": 3.8909, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.4079601990049751, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 0.00014757464155352082, |
|
"loss": 3.9657, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.41044776119402987, |
|
"grad_norm": 7.125, |
|
"learning_rate": 0.0001468090959227082, |
|
"loss": 3.9625, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.4129353233830846, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 0.00014604002097914806, |
|
"loss": 3.8299, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4154228855721393, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 0.00014526747470956176, |
|
"loss": 3.9513, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.417910447761194, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 0.00014449151536240166, |
|
"loss": 3.828, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.42039800995024873, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 0.00014371220144345954, |
|
"loss": 3.9232, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.4228855721393035, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 0.0001429295917114551, |
|
"loss": 3.8572, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4253731343283582, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 0.00014214374517360575, |
|
"loss": 3.9477, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.42786069651741293, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 0.00014135472108117787, |
|
"loss": 4.2486, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.43034825870646765, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 0.00014056257892501885, |
|
"loss": 3.9868, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.43283582089552236, |
|
"grad_norm": 6.5, |
|
"learning_rate": 0.00013976737843107202, |
|
"loss": 4.1234, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.43532338308457713, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 0.00013896917955587328, |
|
"loss": 4.006, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.43781094527363185, |
|
"grad_norm": 7.375, |
|
"learning_rate": 0.00013816804248203052, |
|
"loss": 3.9775, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.44029850746268656, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 0.00013736402761368598, |
|
"loss": 3.9257, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.4427860696517413, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 0.00013655719557196185, |
|
"loss": 3.9621, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.44527363184079605, |
|
"grad_norm": 7.25, |
|
"learning_rate": 0.0001357476071903896, |
|
"loss": 3.8718, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.44776119402985076, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 0.0001349353235103232, |
|
"loss": 3.9892, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4502487562189055, |
|
"grad_norm": 7.625, |
|
"learning_rate": 0.00013412040577633687, |
|
"loss": 4.2505, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.4527363184079602, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 0.0001333029154316072, |
|
"loss": 3.9349, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.4552238805970149, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 0.00013248291411328047, |
|
"loss": 3.9892, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.4577114427860697, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 0.00013166046364782545, |
|
"loss": 3.9654, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.4601990049751244, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 0.0001308356260463717, |
|
"loss": 4.0497, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.4626865671641791, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 0.0001300084635000341, |
|
"loss": 3.8808, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.4651741293532338, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 0.0001291790383752237, |
|
"loss": 3.957, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.46766169154228854, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 0.00012834741320894553, |
|
"loss": 3.936, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.4701492537313433, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 0.00012751365070408333, |
|
"loss": 4.0231, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.472636815920398, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 0.00012667781372467202, |
|
"loss": 4.0101, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.47512437810945274, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 0.00012583996529115762, |
|
"loss": 3.9361, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.47761194029850745, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 0.00012500016857564585, |
|
"loss": 4.0114, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.48009950248756217, |
|
"grad_norm": 6.875, |
|
"learning_rate": 0.00012415848689713903, |
|
"loss": 3.9577, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.48258706467661694, |
|
"grad_norm": 6.875, |
|
"learning_rate": 0.00012331498371676204, |
|
"loss": 3.8951, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.48507462686567165, |
|
"grad_norm": 7.0, |
|
"learning_rate": 0.0001224697226329772, |
|
"loss": 3.9695, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.48756218905472637, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 0.00012162276737678933, |
|
"loss": 3.9444, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.4900497512437811, |
|
"grad_norm": 6.75, |
|
"learning_rate": 0.0001207741818069405, |
|
"loss": 3.9877, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.4925373134328358, |
|
"grad_norm": 6.875, |
|
"learning_rate": 0.00011992402990509515, |
|
"loss": 3.9706, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.49502487562189057, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 0.00011907237577101611, |
|
"loss": 3.8701, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.4975124378109453, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 0.00011821928361773147, |
|
"loss": 4.0109, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 3.9716, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.5024875621890548, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 0.00011650904264292687, |
|
"loss": 3.9534, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.5049751243781094, |
|
"grad_norm": 8.5, |
|
"learning_rate": 0.00011565202277017551, |
|
"loss": 4.0376, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.5074626865671642, |
|
"grad_norm": 5.75, |
|
"learning_rate": 0.000114793822766033, |
|
"loss": 3.9223, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.5099502487562189, |
|
"grad_norm": 7.0, |
|
"learning_rate": 0.00011393450733707309, |
|
"loss": 4.11, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.5124378109452736, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 0.00011307414127397027, |
|
"loss": 4.0138, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.5149253731343284, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 0.00011221278944661473, |
|
"loss": 3.8801, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.5174129353233831, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 0.00011135051679922141, |
|
"loss": 4.0368, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.5199004975124378, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 0.00011048738834543319, |
|
"loss": 3.8343, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.5223880597014925, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 0.00010962346916341903, |
|
"loss": 3.885, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5248756218905473, |
|
"grad_norm": 6.125, |
|
"learning_rate": 0.00010875882439096729, |
|
"loss": 3.9348, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.527363184079602, |
|
"grad_norm": 6.375, |
|
"learning_rate": 0.00010789351922057435, |
|
"loss": 3.9439, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.5298507462686567, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 0.0001070276188945293, |
|
"loss": 3.7975, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.5323383084577115, |
|
"grad_norm": 6.25, |
|
"learning_rate": 0.00010616118869999483, |
|
"loss": 3.8004, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5348258706467661, |
|
"grad_norm": 5.875, |
|
"learning_rate": 0.00010529429396408452, |
|
"loss": 3.967, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.5373134328358209, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 0.00010442700004893764, |
|
"loss": 3.8504, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.5398009950248757, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 0.00010355937234679065, |
|
"loss": 3.7783, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.5422885572139303, |
|
"grad_norm": 7.0, |
|
"learning_rate": 0.00010269147627504692, |
|
"loss": 3.7741, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5447761194029851, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 0.0001018233772713443, |
|
"loss": 3.9042, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.5472636815920398, |
|
"grad_norm": 6.0, |
|
"learning_rate": 0.00010095514078862147, |
|
"loss": 4.0435, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5497512437810945, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 0.00010008683229018256, |
|
"loss": 4.0422, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.5522388059701493, |
|
"grad_norm": 7.0, |
|
"learning_rate": 9.92185172447616e-05, |
|
"loss": 3.9499, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.554726368159204, |
|
"grad_norm": 7.125, |
|
"learning_rate": 9.835026112158637e-05, |
|
"loss": 3.9851, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.5572139303482587, |
|
"grad_norm": 6.25, |
|
"learning_rate": 9.74821293854419e-05, |
|
"loss": 3.9625, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5597014925373134, |
|
"grad_norm": 6.75, |
|
"learning_rate": 9.661418749173467e-05, |
|
"loss": 3.9269, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.5621890547263682, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 9.574650088155752e-05, |
|
"loss": 4.0838, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.5646766169154229, |
|
"grad_norm": 7.40625, |
|
"learning_rate": 9.487913497675536e-05, |
|
"loss": 4.0415, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.5671641791044776, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 9.40121551749925e-05, |
|
"loss": 4.0316, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.5696517412935324, |
|
"grad_norm": 6.0, |
|
"learning_rate": 9.314562684482202e-05, |
|
"loss": 4.0425, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.572139303482587, |
|
"grad_norm": 6.25, |
|
"learning_rate": 9.227961532075671e-05, |
|
"loss": 3.9342, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5746268656716418, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 9.141418589834339e-05, |
|
"loss": 4.0811, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.5771144278606966, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 9.054940382923953e-05, |
|
"loss": 4.0697, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.5796019900497512, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 8.96853343162934e-05, |
|
"loss": 4.0852, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.582089552238806, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 8.882204250862796e-05, |
|
"loss": 4.0077, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.5845771144278606, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 8.795959349672878e-05, |
|
"loss": 3.9226, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.5870646766169154, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 8.709805230753627e-05, |
|
"loss": 4.0092, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.5895522388059702, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 8.623748389954283e-05, |
|
"loss": 3.9131, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.5920398009950248, |
|
"grad_norm": 6.375, |
|
"learning_rate": 8.537795315789509e-05, |
|
"loss": 3.857, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.5945273631840796, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 8.451952488950166e-05, |
|
"loss": 3.9707, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 8.366226381814697e-05, |
|
"loss": 3.9853, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.599502487562189, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 8.280623457961102e-05, |
|
"loss": 4.107, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.6019900497512438, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 8.195150171679608e-05, |
|
"loss": 3.7088, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.6044776119402985, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 8.109812967486025e-05, |
|
"loss": 3.9205, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.6069651741293532, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 8.02461827963585e-05, |
|
"loss": 3.9086, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.6094527363184079, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 7.939572531639128e-05, |
|
"loss": 3.9716, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.6119402985074627, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 7.854682135776131e-05, |
|
"loss": 3.9194, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.6144278606965174, |
|
"grad_norm": 6.875, |
|
"learning_rate": 7.769953492613899e-05, |
|
"loss": 3.8653, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.6169154228855721, |
|
"grad_norm": 6.625, |
|
"learning_rate": 7.685392990523626e-05, |
|
"loss": 4.043, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.6194029850746269, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 7.601007005199021e-05, |
|
"loss": 3.829, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.6218905472636815, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 7.516801899175565e-05, |
|
"loss": 3.9138, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6243781094527363, |
|
"grad_norm": 6.5, |
|
"learning_rate": 7.432784021350796e-05, |
|
"loss": 3.9103, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.6268656716417911, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 7.348959706505626e-05, |
|
"loss": 3.9792, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.6293532338308457, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 7.265335274826704e-05, |
|
"loss": 4.0775, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.6318407960199005, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 7.181917031429874e-05, |
|
"loss": 4.0234, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.6343283582089553, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 7.09871126588481e-05, |
|
"loss": 3.9329, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.6368159203980099, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 7.015724251740766e-05, |
|
"loss": 3.6704, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.6393034825870647, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 6.932962246053577e-05, |
|
"loss": 3.8563, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.6417910447761194, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 6.850431488913895e-05, |
|
"loss": 3.8506, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6442786069651741, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 6.76813820297669e-05, |
|
"loss": 4.008, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.6467661691542289, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 6.686088592992067e-05, |
|
"loss": 4.0959, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6492537313432836, |
|
"grad_norm": 6.375, |
|
"learning_rate": 6.604288845337453e-05, |
|
"loss": 4.0365, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.6517412935323383, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 6.522745127551158e-05, |
|
"loss": 3.8927, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.654228855721393, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 6.44146358786734e-05, |
|
"loss": 3.9165, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.6567164179104478, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 6.360450354752458e-05, |
|
"loss": 4.1257, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.6592039800995025, |
|
"grad_norm": 7.125, |
|
"learning_rate": 6.279711536443185e-05, |
|
"loss": 3.9571, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.6616915422885572, |
|
"grad_norm": 6.125, |
|
"learning_rate": 6.199253220485856e-05, |
|
"loss": 3.7978, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.664179104477612, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 6.119081473277501e-05, |
|
"loss": 3.859, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 4.015, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.6691542288557214, |
|
"grad_norm": 6.625, |
|
"learning_rate": 5.959621842206474e-05, |
|
"loss": 4.0804, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.6716417910447762, |
|
"grad_norm": 6.625, |
|
"learning_rate": 5.880345981282876e-05, |
|
"loss": 4.0607, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6741293532338308, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 5.801380734079907e-05, |
|
"loss": 3.8616, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.6766169154228856, |
|
"grad_norm": 6.375, |
|
"learning_rate": 5.722732054420172e-05, |
|
"loss": 3.8968, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.6791044776119403, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 5.6444058722577165e-05, |
|
"loss": 4.0431, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.681592039800995, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 5.566408093230911e-05, |
|
"loss": 3.9798, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.6840796019900498, |
|
"grad_norm": 5.375, |
|
"learning_rate": 5.4887445982171906e-05, |
|
"loss": 3.7958, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.6865671641791045, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 5.4114212428896424e-05, |
|
"loss": 3.9962, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.6890547263681592, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 5.334443857275487e-05, |
|
"loss": 4.009, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.6915422885572139, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 5.257818245316522e-05, |
|
"loss": 3.9681, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.6940298507462687, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 5.1815501844315105e-05, |
|
"loss": 4.0784, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.6965174129353234, |
|
"grad_norm": 7.375, |
|
"learning_rate": 5.105645425080572e-05, |
|
"loss": 4.0183, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6990049751243781, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 5.030109690331625e-05, |
|
"loss": 3.9356, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.7014925373134329, |
|
"grad_norm": 7.625, |
|
"learning_rate": 4.954948675428853e-05, |
|
"loss": 3.7845, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.7039800995024875, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 4.880168047363312e-05, |
|
"loss": 3.7763, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.7064676616915423, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 4.8057734444456536e-05, |
|
"loss": 4.0405, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.7089552238805971, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 4.7317704758809946e-05, |
|
"loss": 3.9666, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.7114427860696517, |
|
"grad_norm": 7.0, |
|
"learning_rate": 4.658164721345998e-05, |
|
"loss": 3.9511, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.7139303482587065, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 4.584961730568188e-05, |
|
"loss": 4.0864, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.7164179104477612, |
|
"grad_norm": 6.625, |
|
"learning_rate": 4.512167022907494e-05, |
|
"loss": 4.0077, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.7189054726368159, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 4.439786086940115e-05, |
|
"loss": 3.8572, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.7213930348258707, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 4.3678243800446835e-05, |
|
"loss": 3.812, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7238805970149254, |
|
"grad_norm": 5.375, |
|
"learning_rate": 4.296287327990797e-05, |
|
"loss": 3.816, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.7263681592039801, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 4.225180324529917e-05, |
|
"loss": 3.8844, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.7288557213930348, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 4.1545087309887045e-05, |
|
"loss": 3.9426, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.7313432835820896, |
|
"grad_norm": 6.0, |
|
"learning_rate": 4.084277875864776e-05, |
|
"loss": 3.9788, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.7338308457711443, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 4.014493054424944e-05, |
|
"loss": 4.0493, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.736318407960199, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 3.945159528305971e-05, |
|
"loss": 4.0197, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.7388059701492538, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 3.876282525117847e-05, |
|
"loss": 3.9014, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.7412935323383084, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 3.807867238049642e-05, |
|
"loss": 3.987, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.7437810945273632, |
|
"grad_norm": 6.0, |
|
"learning_rate": 3.739918825477953e-05, |
|
"loss": 3.9318, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 3.672442410577965e-05, |
|
"loss": 3.8518, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7487562189054726, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 3.605443080937172e-05, |
|
"loss": 3.7997, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.7512437810945274, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 3.5389258881718e-05, |
|
"loss": 3.9, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.753731343283582, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 3.472895847545905e-05, |
|
"loss": 4.005, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.7562189054726368, |
|
"grad_norm": 7.375, |
|
"learning_rate": 3.407357937593237e-05, |
|
"loss": 3.9962, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.7587064676616916, |
|
"grad_norm": 6.5, |
|
"learning_rate": 3.342317099741886e-05, |
|
"loss": 3.9809, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.7611940298507462, |
|
"grad_norm": 6.875, |
|
"learning_rate": 3.27777823794168e-05, |
|
"loss": 3.9891, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.763681592039801, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 3.213746218294455e-05, |
|
"loss": 4.0958, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.7661691542288557, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 3.150225868687161e-05, |
|
"loss": 3.838, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.7686567164179104, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 3.0872219784278354e-05, |
|
"loss": 3.9027, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.7711442786069652, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 3.02473929788452e-05, |
|
"loss": 3.9055, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.7736318407960199, |
|
"grad_norm": 6.0, |
|
"learning_rate": 2.96278253812707e-05, |
|
"loss": 3.9548, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.7761194029850746, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 2.901356370571967e-05, |
|
"loss": 3.8413, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.7786069651741293, |
|
"grad_norm": 6.625, |
|
"learning_rate": 2.840465426630091e-05, |
|
"loss": 4.1502, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.7810945273631841, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 2.7801142973575243e-05, |
|
"loss": 3.851, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.7835820895522388, |
|
"grad_norm": 6.125, |
|
"learning_rate": 2.7203075331094017e-05, |
|
"loss": 4.0059, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.7860696517412935, |
|
"grad_norm": 7.0, |
|
"learning_rate": 2.6610496431968125e-05, |
|
"loss": 3.8795, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.7885572139303483, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 2.6023450955468176e-05, |
|
"loss": 3.8933, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.7910447761194029, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 2.54419831636557e-05, |
|
"loss": 4.1032, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.7935323383084577, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 2.4866136898045843e-05, |
|
"loss": 3.8866, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.7960199004975125, |
|
"grad_norm": 6.375, |
|
"learning_rate": 2.4295955576301965e-05, |
|
"loss": 4.0359, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7985074626865671, |
|
"grad_norm": 6.75, |
|
"learning_rate": 2.3731482188961818e-05, |
|
"loss": 3.8639, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.8009950248756219, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 2.317275929619627e-05, |
|
"loss": 4.0732, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.8034825870646766, |
|
"grad_norm": 7.625, |
|
"learning_rate": 2.261982902460039e-05, |
|
"loss": 3.9888, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.8059701492537313, |
|
"grad_norm": 6.0, |
|
"learning_rate": 2.2072733064017103e-05, |
|
"loss": 4.0829, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.8084577114427861, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 2.1531512664393838e-05, |
|
"loss": 4.1679, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.8109452736318408, |
|
"grad_norm": 7.0, |
|
"learning_rate": 2.0996208632672475e-05, |
|
"loss": 4.0939, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.8134328358208955, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 2.0466861329712473e-05, |
|
"loss": 3.8609, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.8159203980099502, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 1.9943510667247813e-05, |
|
"loss": 4.083, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.818407960199005, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 1.9426196104877735e-05, |
|
"loss": 3.9825, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.8208955223880597, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 1.89149566470915e-05, |
|
"loss": 3.9559, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8233830845771144, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.8409830840327546e-05, |
|
"loss": 4.0314, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.8258706467661692, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 1.791085677006722e-05, |
|
"loss": 3.8751, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.8283582089552238, |
|
"grad_norm": 7.25, |
|
"learning_rate": 1.741807205796314e-05, |
|
"loss": 4.0051, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.8308457711442786, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.6931513859002635e-05, |
|
"loss": 3.9194, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 3.8528, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.835820895522388, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 1.5977223270362196e-05, |
|
"loss": 4.0617, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.8383084577114428, |
|
"grad_norm": 7.125, |
|
"learning_rate": 1.5509562832294944e-05, |
|
"loss": 3.7389, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.8407960199004975, |
|
"grad_norm": 6.25, |
|
"learning_rate": 1.5048272805171615e-05, |
|
"loss": 3.9292, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.8432835820895522, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.459338796934293e-05, |
|
"loss": 4.011, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.845771144278607, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 1.4144942622220902e-05, |
|
"loss": 3.8728, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8482587064676617, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 1.3702970575692975e-05, |
|
"loss": 4.0874, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.8507462686567164, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 1.3267505153572501e-05, |
|
"loss": 4.0708, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.8532338308457711, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 1.2838579189086353e-05, |
|
"loss": 3.8598, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.8557213930348259, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 1.2416225022399286e-05, |
|
"loss": 4.0435, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.8582089552238806, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.2000474498175552e-05, |
|
"loss": 4.054, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.8606965174129353, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 1.1591358963177923e-05, |
|
"loss": 3.8522, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.8631840796019901, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.118890926390419e-05, |
|
"loss": 3.9089, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.8656716417910447, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 1.0793155744261351e-05, |
|
"loss": 4.0584, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.8681592039800995, |
|
"grad_norm": 6.25, |
|
"learning_rate": 1.0404128243277777e-05, |
|
"loss": 3.9094, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.8706467661691543, |
|
"grad_norm": 6.75, |
|
"learning_rate": 1.0021856092853432e-05, |
|
"loss": 3.9982, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.8731343283582089, |
|
"grad_norm": 6.25, |
|
"learning_rate": 9.646368115548232e-06, |
|
"loss": 3.946, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.8756218905472637, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 9.277692622409018e-06, |
|
"loss": 3.8958, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.8781094527363185, |
|
"grad_norm": 6.375, |
|
"learning_rate": 8.915857410834794e-06, |
|
"loss": 3.7367, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.8805970149253731, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 8.56088976248095e-06, |
|
"loss": 3.9724, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.8830845771144279, |
|
"grad_norm": 6.125, |
|
"learning_rate": 8.212816441202309e-06, |
|
"loss": 4.0212, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.8855721393034826, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 7.871663691035103e-06, |
|
"loss": 3.6865, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.8880597014925373, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 7.53745723421827e-06, |
|
"loss": 3.9914, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.8905472636815921, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 7.2102222692540415e-06, |
|
"loss": 3.9573, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.8930348258706468, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 6.889983469008055e-06, |
|
"loss": 4.1287, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 6.576764978849004e-06, |
|
"loss": 4.1117, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8980099502487562, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 6.27059041482817e-06, |
|
"loss": 3.8274, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.900497512437811, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 5.971482861898836e-06, |
|
"loss": 3.8697, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.9029850746268657, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 5.679464872175666e-06, |
|
"loss": 3.9326, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.9054726368159204, |
|
"grad_norm": 6.5, |
|
"learning_rate": 5.394558463234378e-06, |
|
"loss": 3.8915, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.9079601990049752, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 5.116785116451661e-06, |
|
"loss": 3.9306, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.9104477611940298, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 4.846165775385459e-06, |
|
"loss": 3.901, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.9129353233830846, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 4.5827208441959424e-06, |
|
"loss": 3.9952, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.9154228855721394, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 4.3264701861070345e-06, |
|
"loss": 4.0501, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.917910447761194, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 4.077433121908747e-06, |
|
"loss": 3.7784, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.9203980099502488, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 3.835628428500515e-06, |
|
"loss": 3.928, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9228855721393034, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 3.601074337475352e-06, |
|
"loss": 3.9705, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.9253731343283582, |
|
"grad_norm": 7.5, |
|
"learning_rate": 3.3737885337452814e-06, |
|
"loss": 4.0769, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.927860696517413, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 3.153788154207926e-06, |
|
"loss": 3.9098, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.9303482587064676, |
|
"grad_norm": 6.875, |
|
"learning_rate": 2.9410897864544206e-06, |
|
"loss": 4.037, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.9328358208955224, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 2.735709467518699e-06, |
|
"loss": 3.9169, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.9353233830845771, |
|
"grad_norm": 6.5, |
|
"learning_rate": 2.5376626826683956e-06, |
|
"loss": 3.9237, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.9378109452736318, |
|
"grad_norm": 6.375, |
|
"learning_rate": 2.3469643642372586e-06, |
|
"loss": 3.9737, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.9402985074626866, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 2.1636288904992585e-06, |
|
"loss": 4.0875, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.9427860696517413, |
|
"grad_norm": 6.25, |
|
"learning_rate": 1.9876700845845475e-06, |
|
"loss": 3.8926, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.945273631840796, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 1.8191012134371577e-06, |
|
"loss": 3.8843, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9477611940298507, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.6579349868147687e-06, |
|
"loss": 3.9296, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.9502487562189055, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 1.5041835563303742e-06, |
|
"loss": 3.8406, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.9527363184079602, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 1.3578585145360812e-06, |
|
"loss": 4.1326, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.9552238805970149, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 1.2189708940490652e-06, |
|
"loss": 3.8935, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.9577114427860697, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 1.0875311667196908e-06, |
|
"loss": 3.7818, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.9601990049751243, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 9.635492428420434e-07, |
|
"loss": 3.6428, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.9626865671641791, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 8.470344704066046e-07, |
|
"loss": 3.8555, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.9651741293532339, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 7.379956343955386e-07, |
|
"loss": 3.8856, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.9676616915422885, |
|
"grad_norm": 6.0, |
|
"learning_rate": 6.364409561202323e-07, |
|
"loss": 4.0294, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.9701492537313433, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 5.42378092601481e-07, |
|
"loss": 3.9593, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.972636815920398, |
|
"grad_norm": 6.25, |
|
"learning_rate": 4.558141359921386e-07, |
|
"loss": 3.9914, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.9751243781094527, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.7675561304238994e-07, |
|
"loss": 3.9707, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.9776119402985075, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 3.0520848460765527e-07, |
|
"loss": 3.7689, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.9800995024875622, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 2.4117814519911684e-07, |
|
"loss": 3.9225, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.9825870646766169, |
|
"grad_norm": 6.125, |
|
"learning_rate": 1.846694225770551e-07, |
|
"loss": 3.9233, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.9850746268656716, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 1.3568657738678435e-07, |
|
"loss": 3.8331, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.9875621890547264, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 9.423330283742093e-08, |
|
"loss": 3.993, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.9900497512437811, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 6.031272442341696e-08, |
|
"loss": 3.8852, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.9925373134328358, |
|
"grad_norm": 6.75, |
|
"learning_rate": 3.392739968894887e-08, |
|
"loss": 3.821, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.9950248756218906, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.5079318035016164e-08, |
|
"loss": 3.975, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9975124378109452, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 3.769900569505769e-09, |
|
"loss": 4.0483, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 0.0, |
|
"loss": 3.9369, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2010, |
|
"total_flos": 1275064289820672.0, |
|
"train_loss": 4.025861802029966, |
|
"train_runtime": 256.6461, |
|
"train_samples_per_second": 125.278, |
|
"train_steps_per_second": 7.832 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2010, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1275064289820672.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|