{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.985272459499264,
  "eval_steps": 500,
  "global_step": 3390,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.029455081001472753,
      "grad_norm": 10.109375,
      "learning_rate": 0.00019999570594853575,
      "loss": 2.6151,
      "step": 10
    },
    {
      "epoch": 0.05891016200294551,
      "grad_norm": 9.828125,
      "learning_rate": 0.00019998282416292055,
      "loss": 2.4649,
      "step": 20
    },
    {
      "epoch": 0.08836524300441827,
      "grad_norm": 8.59375,
      "learning_rate": 0.00019996135574945544,
      "loss": 2.4015,
      "step": 30
    },
    {
      "epoch": 0.11782032400589101,
      "grad_norm": 7.8359375,
      "learning_rate": 0.00019993130255186977,
      "loss": 2.4925,
      "step": 40
    },
    {
      "epoch": 0.14727540500736377,
      "grad_norm": 7.76953125,
      "learning_rate": 0.00019989266715116316,
      "loss": 2.4309,
      "step": 50
    },
    {
      "epoch": 0.17673048600883653,
      "grad_norm": 10.2265625,
      "learning_rate": 0.0001998454528653836,
      "loss": 2.3814,
      "step": 60
    },
    {
      "epoch": 0.20618556701030927,
      "grad_norm": 10.375,
      "learning_rate": 0.00019978966374934254,
      "loss": 2.3852,
      "step": 70
    },
    {
      "epoch": 0.23564064801178203,
      "grad_norm": 9.4453125,
      "learning_rate": 0.00019972530459426663,
      "loss": 2.396,
      "step": 80
    },
    {
      "epoch": 0.2650957290132548,
      "grad_norm": 9.21875,
      "learning_rate": 0.00019965238092738643,
      "loss": 2.386,
      "step": 90
    },
    {
      "epoch": 0.29455081001472755,
      "grad_norm": 10.1328125,
      "learning_rate": 0.00019957089901146148,
      "loss": 2.3984,
      "step": 100
    },
    {
      "epoch": 0.3240058910162003,
      "grad_norm": 8.484375,
      "learning_rate": 0.00019948086584424256,
      "loss": 2.3893,
      "step": 110
    },
    {
      "epoch": 0.35346097201767307,
      "grad_norm": 7.83203125,
      "learning_rate": 0.0001993822891578708,
      "loss": 2.4262,
      "step": 120
    },
    {
      "epoch": 0.38291605301914583,
      "grad_norm": 6.61328125,
      "learning_rate": 0.00019927517741821343,
      "loss": 2.3399,
      "step": 130
    },
    {
      "epoch": 0.41237113402061853,
      "grad_norm": 7.91015625,
      "learning_rate": 0.0001991595398241369,
      "loss": 2.4667,
      "step": 140
    },
    {
      "epoch": 0.4418262150220913,
      "grad_norm": 5.46484375,
      "learning_rate": 0.0001990353863067169,
      "loss": 2.4316,
      "step": 150
    },
    {
      "epoch": 0.47128129602356406,
      "grad_norm": 7.76171875,
      "learning_rate": 0.00019890272752838518,
      "loss": 2.4068,
      "step": 160
    },
    {
      "epoch": 0.5007363770250368,
      "grad_norm": 6.78515625,
      "learning_rate": 0.00019876157488201424,
      "loss": 2.4295,
      "step": 170
    },
    {
      "epoch": 0.5301914580265096,
      "grad_norm": 6.67578125,
      "learning_rate": 0.00019861194048993863,
      "loss": 2.4124,
      "step": 180
    },
    {
      "epoch": 0.5596465390279823,
      "grad_norm": 4.140625,
      "learning_rate": 0.00019845383720291392,
      "loss": 2.3651,
      "step": 190
    },
    {
      "epoch": 0.5891016200294551,
      "grad_norm": 3.337890625,
      "learning_rate": 0.00019828727859901317,
      "loss": 2.4137,
      "step": 200
    },
    {
      "epoch": 0.6185567010309279,
      "grad_norm": 4.01953125,
      "learning_rate": 0.0001981122789824607,
      "loss": 2.3851,
      "step": 210
    },
    {
      "epoch": 0.6480117820324006,
      "grad_norm": 4.94140625,
      "learning_rate": 0.00019792885338240374,
      "loss": 2.3647,
      "step": 220
    },
    {
      "epoch": 0.6774668630338734,
      "grad_norm": 4.5234375,
      "learning_rate": 0.00019773701755162158,
      "loss": 2.2891,
      "step": 230
    },
    {
      "epoch": 0.7069219440353461,
      "grad_norm": 3.619140625,
      "learning_rate": 0.00019753678796517282,
      "loss": 2.33,
      "step": 240
    },
    {
      "epoch": 0.7363770250368189,
      "grad_norm": 3.244140625,
      "learning_rate": 0.00019732818181898045,
      "loss": 2.3588,
      "step": 250
    },
    {
      "epoch": 0.7658321060382917,
      "grad_norm": 5.95703125,
      "learning_rate": 0.00019711121702835504,
      "loss": 2.4006,
      "step": 260
    },
    {
      "epoch": 0.7952871870397643,
      "grad_norm": 3.873046875,
      "learning_rate": 0.00019688591222645607,
      "loss": 2.3874,
      "step": 270
    },
    {
      "epoch": 0.8247422680412371,
      "grad_norm": 3.833984375,
      "learning_rate": 0.0001966522867626919,
      "loss": 2.3104,
      "step": 280
    },
    {
      "epoch": 0.8541973490427098,
      "grad_norm": 4.55078125,
      "learning_rate": 0.00019641036070105778,
      "loss": 2.3146,
      "step": 290
    },
    {
      "epoch": 0.8836524300441826,
      "grad_norm": 5.9765625,
      "learning_rate": 0.0001961601548184129,
      "loss": 2.3112,
      "step": 300
    },
    {
      "epoch": 0.9131075110456554,
      "grad_norm": 5.18359375,
      "learning_rate": 0.00019590169060269602,
      "loss": 2.2807,
      "step": 310
    },
    {
      "epoch": 0.9425625920471281,
      "grad_norm": 8.359375,
      "learning_rate": 0.00019563499025107998,
      "loss": 2.3536,
      "step": 320
    },
    {
      "epoch": 0.9720176730486009,
      "grad_norm": 5.60546875,
      "learning_rate": 0.00019536007666806556,
      "loss": 2.3461,
      "step": 330
    },
    {
      "epoch": 1.0014727540500736,
      "grad_norm": 4.32421875,
      "learning_rate": 0.00019507697346351414,
      "loss": 2.3032,
      "step": 340
    },
    {
      "epoch": 1.0309278350515463,
      "grad_norm": 3.591796875,
      "learning_rate": 0.00019478570495062037,
      "loss": 2.2973,
      "step": 350
    },
    {
      "epoch": 1.0603829160530192,
      "grad_norm": 5.73046875,
      "learning_rate": 0.0001944862961438239,
      "loss": 2.3204,
      "step": 360
    },
    {
      "epoch": 1.0898379970544918,
      "grad_norm": 3.8125,
      "learning_rate": 0.0001941787727566613,
      "loss": 2.3208,
      "step": 370
    },
    {
      "epoch": 1.1192930780559647,
      "grad_norm": 4.33984375,
      "learning_rate": 0.00019386316119955756,
      "loss": 2.4171,
      "step": 380
    },
    {
      "epoch": 1.1487481590574373,
      "grad_norm": 3.208984375,
      "learning_rate": 0.00019353948857755803,
      "loss": 2.2795,
      "step": 390
    },
    {
      "epoch": 1.1782032400589102,
      "grad_norm": 4.07421875,
      "learning_rate": 0.00019320778268800066,
      "loss": 2.2644,
      "step": 400
    },
    {
      "epoch": 1.2076583210603828,
      "grad_norm": 4.53125,
      "learning_rate": 0.00019286807201812867,
      "loss": 2.2787,
      "step": 410
    },
    {
      "epoch": 1.2371134020618557,
      "grad_norm": 3.912109375,
      "learning_rate": 0.00019252038574264405,
      "loss": 2.2143,
      "step": 420
    },
    {
      "epoch": 1.2665684830633284,
      "grad_norm": 3.607421875,
      "learning_rate": 0.00019216475372120197,
      "loss": 2.2773,
      "step": 430
    },
    {
      "epoch": 1.2960235640648012,
      "grad_norm": 2.80859375,
      "learning_rate": 0.00019180120649584653,
      "loss": 2.2923,
      "step": 440
    },
    {
      "epoch": 1.3254786450662739,
      "grad_norm": 2.791015625,
      "learning_rate": 0.00019142977528838762,
      "loss": 2.2583,
      "step": 450
    },
    {
      "epoch": 1.3549337260677468,
      "grad_norm": 2.541015625,
      "learning_rate": 0.00019105049199771962,
      "loss": 2.2466,
      "step": 460
    },
    {
      "epoch": 1.3843888070692194,
      "grad_norm": 2.541015625,
      "learning_rate": 0.00019066338919708197,
      "loss": 2.2686,
      "step": 470
    },
    {
      "epoch": 1.413843888070692,
      "grad_norm": 3.03125,
      "learning_rate": 0.00019026850013126157,
      "loss": 2.246,
      "step": 480
    },
    {
      "epoch": 1.443298969072165,
      "grad_norm": 2.46484375,
      "learning_rate": 0.0001898658587137379,
      "loss": 2.2931,
      "step": 490
    },
    {
      "epoch": 1.4727540500736378,
      "grad_norm": 2.740234375,
      "learning_rate": 0.0001894554995237703,
      "loss": 2.255,
      "step": 500
    },
    {
      "epoch": 1.5022091310751104,
      "grad_norm": 2.658203125,
      "learning_rate": 0.00018903745780342839,
      "loss": 2.2697,
      "step": 510
    },
    {
      "epoch": 1.531664212076583,
      "grad_norm": 2.486328125,
      "learning_rate": 0.0001886117694545654,
      "loss": 2.2511,
      "step": 520
    },
    {
      "epoch": 1.561119293078056,
      "grad_norm": 3.216796875,
      "learning_rate": 0.00018817847103573486,
      "loss": 2.2199,
      "step": 530
    },
    {
      "epoch": 1.5905743740795288,
      "grad_norm": 2.8984375,
      "learning_rate": 0.00018773759975905098,
      "loss": 2.2668,
      "step": 540
    },
    {
      "epoch": 1.6200294550810015,
      "grad_norm": 3.052734375,
      "learning_rate": 0.00018728919348699283,
      "loss": 2.283,
      "step": 550
    },
    {
      "epoch": 1.6494845360824741,
      "grad_norm": 2.8515625,
      "learning_rate": 0.00018683329072915252,
      "loss": 2.2239,
      "step": 560
    },
    {
      "epoch": 1.678939617083947,
      "grad_norm": 2.607421875,
      "learning_rate": 0.0001863699306389282,
      "loss": 2.1978,
      "step": 570
    },
    {
      "epoch": 1.7083946980854199,
      "grad_norm": 2.978515625,
      "learning_rate": 0.0001858991530101613,
      "loss": 2.2468,
      "step": 580
    },
    {
      "epoch": 1.7378497790868925,
      "grad_norm": 2.78125,
      "learning_rate": 0.0001854209982737192,
      "loss": 2.2494,
      "step": 590
    },
    {
      "epoch": 1.7673048600883652,
      "grad_norm": 2.75390625,
      "learning_rate": 0.00018493550749402278,
      "loss": 2.2163,
      "step": 600
    },
    {
      "epoch": 1.7967599410898378,
      "grad_norm": 2.908203125,
      "learning_rate": 0.0001844427223655199,
      "loss": 2.2156,
      "step": 610
    },
    {
      "epoch": 1.8262150220913107,
      "grad_norm": 3.236328125,
      "learning_rate": 0.00018394268520910466,
      "loss": 2.2757,
      "step": 620
    },
    {
      "epoch": 1.8556701030927836,
      "grad_norm": 2.51953125,
      "learning_rate": 0.00018343543896848273,
      "loss": 2.1927,
      "step": 630
    },
    {
      "epoch": 1.8851251840942562,
      "grad_norm": 3.451171875,
      "learning_rate": 0.00018292102720648333,
      "loss": 2.1343,
      "step": 640
    },
    {
      "epoch": 1.9145802650957289,
      "grad_norm": 2.0625,
      "learning_rate": 0.00018239949410131802,
      "loss": 2.2521,
      "step": 650
    },
    {
      "epoch": 1.9440353460972017,
      "grad_norm": 3.177734375,
      "learning_rate": 0.00018187088444278674,
      "loss": 2.2107,
      "step": 660
    },
    {
      "epoch": 1.9734904270986746,
      "grad_norm": 2.892578125,
      "learning_rate": 0.00018133524362843104,
      "loss": 2.1933,
      "step": 670
    },
    {
      "epoch": 2.0029455081001473,
      "grad_norm": 2.4296875,
      "learning_rate": 0.00018079261765963537,
      "loss": 2.1848,
      "step": 680
    },
    {
      "epoch": 2.03240058910162,
      "grad_norm": 2.189453125,
      "learning_rate": 0.00018024305313767646,
      "loss": 2.2219,
      "step": 690
    },
    {
      "epoch": 2.0618556701030926,
      "grad_norm": 2.189453125,
      "learning_rate": 0.00017968659725972112,
      "loss": 2.1635,
      "step": 700
    },
    {
      "epoch": 2.0913107511045657,
      "grad_norm": 2.607421875,
      "learning_rate": 0.00017912329781477287,
      "loss": 2.2145,
      "step": 710
    },
    {
      "epoch": 2.1207658321060383,
      "grad_norm": 1.8662109375,
      "learning_rate": 0.00017855320317956784,
      "loss": 2.178,
      "step": 720
    },
    {
      "epoch": 2.150220913107511,
      "grad_norm": 2.13671875,
      "learning_rate": 0.00017797636231442016,
      "loss": 2.1433,
      "step": 730
    },
    {
      "epoch": 2.1796759941089836,
      "grad_norm": 2.98828125,
      "learning_rate": 0.000177392824759017,
      "loss": 2.1158,
      "step": 740
    },
    {
      "epoch": 2.2091310751104567,
      "grad_norm": 2.25,
      "learning_rate": 0.0001768026406281642,
      "loss": 2.2127,
      "step": 750
    },
    {
      "epoch": 2.2385861561119293,
      "grad_norm": 2.484375,
      "learning_rate": 0.00017620586060748252,
      "loss": 2.1268,
      "step": 760
    },
    {
      "epoch": 2.268041237113402,
      "grad_norm": 1.9482421875,
      "learning_rate": 0.00017560253594905425,
      "loss": 2.1628,
      "step": 770
    },
    {
      "epoch": 2.2974963181148746,
      "grad_norm": 1.94140625,
      "learning_rate": 0.00017499271846702213,
      "loss": 2.1489,
      "step": 780
    },
    {
      "epoch": 2.3269513991163477,
      "grad_norm": 1.6005859375,
      "learning_rate": 0.0001743764605331392,
      "loss": 2.1078,
      "step": 790
    },
    {
      "epoch": 2.3564064801178204,
      "grad_norm": 1.677734375,
      "learning_rate": 0.00017375381507227108,
      "loss": 2.1365,
      "step": 800
    },
    {
      "epoch": 2.385861561119293,
      "grad_norm": 1.537109375,
      "learning_rate": 0.00017312483555785086,
      "loss": 2.1179,
      "step": 810
    },
    {
      "epoch": 2.4153166421207657,
      "grad_norm": 1.8291015625,
      "learning_rate": 0.00017248957600728664,
      "loss": 2.0896,
      "step": 820
    },
    {
      "epoch": 2.444771723122239,
      "grad_norm": 1.9423828125,
      "learning_rate": 0.00017184809097732246,
      "loss": 2.1057,
      "step": 830
    },
    {
      "epoch": 2.4742268041237114,
      "grad_norm": 1.443359375,
      "learning_rate": 0.00017120043555935298,
      "loss": 2.1325,
      "step": 840
    },
    {
      "epoch": 2.503681885125184,
      "grad_norm": 2.630859375,
      "learning_rate": 0.00017054666537469213,
      "loss": 2.0701,
      "step": 850
    },
    {
      "epoch": 2.5331369661266567,
      "grad_norm": 1.95703125,
      "learning_rate": 0.00016988683656979624,
      "loss": 2.1342,
      "step": 860
    },
    {
      "epoch": 2.5625920471281294,
      "grad_norm": 1.50390625,
      "learning_rate": 0.00016922100581144228,
      "loss": 2.1223,
      "step": 870
    },
    {
      "epoch": 2.5920471281296025,
      "grad_norm": 2.083984375,
      "learning_rate": 0.00016854923028186111,
      "loss": 2.1597,
      "step": 880
    },
    {
      "epoch": 2.621502209131075,
      "grad_norm": 1.7548828125,
      "learning_rate": 0.00016787156767382659,
      "loss": 2.1915,
      "step": 890
    },
    {
      "epoch": 2.6509572901325478,
      "grad_norm": 1.4912109375,
      "learning_rate": 0.00016718807618570106,
      "loss": 2.1678,
      "step": 900
    },
    {
      "epoch": 2.680412371134021,
      "grad_norm": 3.720703125,
      "learning_rate": 0.00016649881451643705,
      "loss": 2.1261,
      "step": 910
    },
    {
      "epoch": 2.7098674521354935,
      "grad_norm": 1.833984375,
      "learning_rate": 0.0001658038418605361,
      "loss": 2.0961,
      "step": 920
    },
    {
      "epoch": 2.739322533136966,
      "grad_norm": 1.80859375,
      "learning_rate": 0.00016510321790296525,
      "loss": 2.106,
      "step": 930
    },
    {
      "epoch": 2.768777614138439,
      "grad_norm": 1.5380859375,
      "learning_rate": 0.00016439700281403114,
      "loss": 2.112,
      "step": 940
    },
    {
      "epoch": 2.7982326951399115,
      "grad_norm": 1.341796875,
      "learning_rate": 0.00016368525724421248,
      "loss": 2.1424,
      "step": 950
    },
    {
      "epoch": 2.827687776141384,
      "grad_norm": 1.8505859375,
      "learning_rate": 0.00016296804231895142,
      "loss": 2.0889,
      "step": 960
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 1.58203125,
      "learning_rate": 0.00016224541963340391,
      "loss": 2.0933,
      "step": 970
    },
    {
      "epoch": 2.88659793814433,
      "grad_norm": 1.447265625,
      "learning_rate": 0.00016151745124715002,
      "loss": 2.0716,
      "step": 980
    },
    {
      "epoch": 2.9160530191458025,
      "grad_norm": 1.2646484375,
      "learning_rate": 0.00016078419967886402,
      "loss": 2.0821,
      "step": 990
    },
    {
      "epoch": 2.9455081001472756,
      "grad_norm": 1.1318359375,
      "learning_rate": 0.00016004572790094535,
      "loss": 2.1024,
      "step": 1000
    },
    {
      "epoch": 2.9749631811487482,
      "grad_norm": 1.330078125,
      "learning_rate": 0.00015930209933411036,
      "loss": 2.0347,
      "step": 1010
    },
    {
      "epoch": 3.004418262150221,
      "grad_norm": 1.2509765625,
      "learning_rate": 0.00015855337784194577,
      "loss": 2.0021,
      "step": 1020
    },
    {
      "epoch": 3.0338733431516935,
      "grad_norm": 1.4970703125,
      "learning_rate": 0.00015779962772542402,
      "loss": 2.0147,
      "step": 1030
    },
    {
      "epoch": 3.063328424153166,
      "grad_norm": 1.4833984375,
      "learning_rate": 0.0001570409137173809,
      "loss": 2.078,
      "step": 1040
    },
    {
      "epoch": 3.0927835051546393,
      "grad_norm": 1.5166015625,
      "learning_rate": 0.00015627730097695638,
      "loss": 2.0246,
      "step": 1050
    },
    {
      "epoch": 3.122238586156112,
      "grad_norm": 1.5107421875,
      "learning_rate": 0.00015550885508399856,
      "loss": 2.0394,
      "step": 1060
    },
    {
      "epoch": 3.1516936671575846,
      "grad_norm": 1.375,
      "learning_rate": 0.00015473564203343174,
      "loss": 2.0159,
      "step": 1070
    },
    {
      "epoch": 3.1811487481590572,
      "grad_norm": 1.2431640625,
      "learning_rate": 0.00015395772822958845,
      "loss": 2.0241,
      "step": 1080
    },
    {
      "epoch": 3.2106038291605303,
      "grad_norm": 1.3994140625,
      "learning_rate": 0.00015317518048050697,
      "loss": 1.9786,
      "step": 1090
    },
    {
      "epoch": 3.240058910162003,
      "grad_norm": 1.34375,
      "learning_rate": 0.00015238806599219336,
      "loss": 2.0276,
      "step": 1100
    },
    {
      "epoch": 3.2695139911634756,
      "grad_norm": 1.251953125,
      "learning_rate": 0.0001515964523628501,
      "loss": 1.9723,
      "step": 1110
    },
    {
      "epoch": 3.2989690721649483,
      "grad_norm": 1.392578125,
      "learning_rate": 0.00015080040757707046,
      "loss": 1.951,
      "step": 1120
    },
    {
      "epoch": 3.3284241531664214,
      "grad_norm": 1.380859375,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.989,
      "step": 1130
    },
    {
      "epoch": 3.357879234167894,
      "grad_norm": 1.5498046875,
      "learning_rate": 0.00014919529837146528,
      "loss": 2.0103,
      "step": 1140
    },
    {
      "epoch": 3.3873343151693667,
      "grad_norm": 1.5712890625,
      "learning_rate": 0.00014838637180007047,
      "loss": 1.9914,
      "step": 1150
    },
    {
      "epoch": 3.4167893961708393,
      "grad_norm": 1.8037109375,
      "learning_rate": 0.00014757328975726207,
      "loss": 1.9981,
      "step": 1160
    },
    {
      "epoch": 3.4462444771723124,
      "grad_norm": 1.3818359375,
      "learning_rate": 0.0001467561220713628,
      "loss": 2.0569,
      "step": 1170
    },
    {
      "epoch": 3.475699558173785,
      "grad_norm": 1.263671875,
      "learning_rate": 0.00014593493892157473,
      "loss": 2.0572,
      "step": 1180
    },
    {
      "epoch": 3.5051546391752577,
      "grad_norm": 1.322265625,
      "learning_rate": 0.00014510981083195188,
      "loss": 2.0181,
      "step": 1190
    },
    {
      "epoch": 3.5346097201767304,
      "grad_norm": 1.2841796875,
      "learning_rate": 0.00014428080866534396,
      "loss": 1.9768,
      "step": 1200
    },
    {
      "epoch": 3.564064801178203,
      "grad_norm": 1.0927734375,
      "learning_rate": 0.00014344800361731027,
      "loss": 1.9746,
      "step": 1210
    },
    {
      "epoch": 3.593519882179676,
      "grad_norm": 1.3828125,
      "learning_rate": 0.00014261146721000553,
      "loss": 1.9891,
      "step": 1220
    },
    {
      "epoch": 3.6229749631811488,
      "grad_norm": 1.3505859375,
      "learning_rate": 0.00014177127128603745,
      "loss": 1.9369,
      "step": 1230
    },
    {
      "epoch": 3.6524300441826214,
      "grad_norm": 1.2265625,
      "learning_rate": 0.00014092748800229683,
      "loss": 1.963,
      "step": 1240
    },
    {
      "epoch": 3.6818851251840945,
      "grad_norm": 1.123046875,
      "learning_rate": 0.00014008018982376044,
      "loss": 1.9741,
      "step": 1250
    },
    {
      "epoch": 3.711340206185567,
      "grad_norm": 1.1748046875,
      "learning_rate": 0.0001392294495172681,
      "loss": 1.9275,
      "step": 1260
    },
    {
      "epoch": 3.74079528718704,
      "grad_norm": 1.369140625,
      "learning_rate": 0.0001383753401452729,
      "loss": 1.9831,
      "step": 1270
    },
    {
      "epoch": 3.7702503681885124,
      "grad_norm": 1.2236328125,
      "learning_rate": 0.0001375179350595669,
      "loss": 1.9445,
      "step": 1280
    },
    {
      "epoch": 3.799705449189985,
      "grad_norm": 1.341796875,
      "learning_rate": 0.0001366573078949813,
      "loss": 2.014,
      "step": 1290
    },
    {
      "epoch": 3.829160530191458,
      "grad_norm": 1.1904296875,
      "learning_rate": 0.00013579353256306287,
      "loss": 2.006,
      "step": 1300
    },
    {
      "epoch": 3.858615611192931,
      "grad_norm": 1.2060546875,
      "learning_rate": 0.00013492668324572614,
      "loss": 2.007,
      "step": 1310
    },
    {
      "epoch": 3.8880706921944035,
      "grad_norm": 1.2109375,
      "learning_rate": 0.00013405683438888282,
      "loss": 1.9583,
      "step": 1320
    },
    {
      "epoch": 3.917525773195876,
      "grad_norm": 1.216796875,
      "learning_rate": 0.00013318406069604794,
      "loss": 2.0087,
      "step": 1330
    },
    {
      "epoch": 3.9469808541973492,
      "grad_norm": 1.2822265625,
      "learning_rate": 0.00013230843712192463,
      "loss": 1.993,
      "step": 1340
    },
    {
      "epoch": 3.976435935198822,
      "grad_norm": 1.3349609375,
      "learning_rate": 0.00013143003886596669,
      "loss": 1.9554,
      "step": 1350
    },
    {
      "epoch": 4.0058910162002945,
      "grad_norm": 1.1884765625,
      "learning_rate": 0.00013054894136592052,
      "loss": 1.8819,
      "step": 1360
    },
    {
      "epoch": 4.035346097201767,
      "grad_norm": 1.3740234375,
      "learning_rate": 0.00012966522029134623,
      "loss": 1.8809,
      "step": 1370
    },
    {
      "epoch": 4.06480117820324,
      "grad_norm": 1.3447265625,
      "learning_rate": 0.00012877895153711935,
      "loss": 1.8892,
      "step": 1380
    },
    {
      "epoch": 4.0942562592047125,
      "grad_norm": 1.462890625,
      "learning_rate": 0.00012789021121691274,
      "loss": 1.892,
      "step": 1390
    },
    {
      "epoch": 4.123711340206185,
      "grad_norm": 1.5107421875,
      "learning_rate": 0.00012699907565665982,
      "loss": 1.8828,
      "step": 1400
    },
    {
      "epoch": 4.153166421207659,
      "grad_norm": 1.376953125,
      "learning_rate": 0.00012610562138799978,
      "loss": 1.9302,
      "step": 1410
    },
    {
      "epoch": 4.182621502209131,
      "grad_norm": 1.4072265625,
      "learning_rate": 0.0001252099251417048,
      "loss": 1.8978,
      "step": 1420
    },
    {
      "epoch": 4.212076583210604,
      "grad_norm": 1.3017578125,
      "learning_rate": 0.00012431206384109044,
      "loss": 1.8759,
      "step": 1430
    },
    {
      "epoch": 4.241531664212077,
      "grad_norm": 1.302734375,
      "learning_rate": 0.0001234121145954094,
      "loss": 1.83,
      "step": 1440
    },
    {
      "epoch": 4.270986745213549,
      "grad_norm": 1.2998046875,
      "learning_rate": 0.00012251015469322916,
      "loss": 1.9297,
      "step": 1450
    },
    {
      "epoch": 4.300441826215022,
      "grad_norm": 1.1630859375,
      "learning_rate": 0.00012160626159579447,
      "loss": 1.8802,
      "step": 1460
    },
    {
      "epoch": 4.329896907216495,
      "grad_norm": 1.2919921875,
      "learning_rate": 0.00012070051293037492,
      "loss": 1.8425,
      "step": 1470
    },
    {
      "epoch": 4.359351988217967,
      "grad_norm": 1.2509765625,
      "learning_rate": 0.00011979298648359823,
      "loss": 1.9012,
      "step": 1480
    },
    {
      "epoch": 4.388807069219441,
      "grad_norm": 1.1875,
      "learning_rate": 0.00011888376019476966,
      "loss": 1.8615,
      "step": 1490
    },
    {
      "epoch": 4.418262150220913,
      "grad_norm": 1.423828125,
      "learning_rate": 0.00011797291214917881,
      "loss": 1.8457,
      "step": 1500
    },
    {
      "epoch": 4.447717231222386,
      "grad_norm": 1.0859375,
      "learning_rate": 0.00011706052057139335,
      "loss": 1.8759,
      "step": 1510
    },
    {
      "epoch": 4.477172312223859,
      "grad_norm": 1.2333984375,
      "learning_rate": 0.00011614666381854107,
      "loss": 1.8354,
      "step": 1520
    },
    {
      "epoch": 4.506627393225331,
      "grad_norm": 1.4306640625,
      "learning_rate": 0.0001152314203735805,
      "loss": 1.9068,
      "step": 1530
    },
    {
      "epoch": 4.536082474226804,
      "grad_norm": 1.1806640625,
      "learning_rate": 0.00011431486883856082,
      "loss": 1.8969,
      "step": 1540
    },
    {
      "epoch": 4.565537555228277,
      "grad_norm": 1.11328125,
      "learning_rate": 0.00011339708792787119,
      "loss": 1.8408,
      "step": 1550
    },
    {
      "epoch": 4.594992636229749,
      "grad_norm": 1.2548828125,
      "learning_rate": 0.00011247815646148087,
      "loss": 1.9036,
      "step": 1560
    },
    {
      "epoch": 4.624447717231222,
      "grad_norm": 1.2119140625,
      "learning_rate": 0.00011155815335817011,
      "loss": 1.8163,
      "step": 1570
    },
    {
      "epoch": 4.6539027982326955,
      "grad_norm": 1.2255859375,
      "learning_rate": 0.00011063715762875225,
      "loss": 1.8382,
      "step": 1580
    },
    {
      "epoch": 4.683357879234168,
      "grad_norm": 1.3173828125,
      "learning_rate": 0.0001097152483692886,
      "loss": 1.8436,
      "step": 1590
    },
    {
      "epoch": 4.712812960235641,
      "grad_norm": 1.17578125,
      "learning_rate": 0.00010879250475429523,
      "loss": 1.8737,
      "step": 1600
    },
    {
      "epoch": 4.742268041237113,
      "grad_norm": 1.134765625,
      "learning_rate": 0.00010786900602994359,
      "loss": 1.7844,
      "step": 1610
    },
    {
      "epoch": 4.771723122238586,
      "grad_norm": 1.3564453125,
      "learning_rate": 0.00010694483150725458,
      "loss": 1.8593,
      "step": 1620
    },
    {
      "epoch": 4.801178203240059,
      "grad_norm": 1.0625,
      "learning_rate": 0.0001060200605552876,
      "loss": 1.9182,
      "step": 1630
    },
    {
      "epoch": 4.830633284241531,
      "grad_norm": 1.2099609375,
      "learning_rate": 0.00010509477259432372,
      "loss": 1.8436,
      "step": 1640
    },
    {
      "epoch": 4.860088365243005,
      "grad_norm": 1.2333984375,
      "learning_rate": 0.00010416904708904548,
      "loss": 1.8418,
      "step": 1650
    },
    {
      "epoch": 4.889543446244478,
      "grad_norm": 1.240234375,
      "learning_rate": 0.00010324296354171207,
      "loss": 1.8142,
      "step": 1660
    },
    {
      "epoch": 4.91899852724595,
      "grad_norm": 1.169921875,
      "learning_rate": 0.00010231660148533183,
      "loss": 1.8852,
      "step": 1670
    },
    {
      "epoch": 4.948453608247423,
      "grad_norm": 1.142578125,
      "learning_rate": 0.00010139004047683151,
      "loss": 1.8006,
      "step": 1680
    },
    {
      "epoch": 4.9779086892488955,
      "grad_norm": 1.1513671875,
      "learning_rate": 0.00010046336009022435,
      "loss": 1.8859,
      "step": 1690
    },
    {
      "epoch": 5.007363770250368,
      "grad_norm": 1.189453125,
      "learning_rate": 9.953663990977568e-05,
      "loss": 1.8146,
      "step": 1700
    },
    {
      "epoch": 5.036818851251841,
      "grad_norm": 1.2392578125,
      "learning_rate": 9.860995952316851e-05,
      "loss": 1.7467,
      "step": 1710
    },
    {
      "epoch": 5.0662739322533135,
      "grad_norm": 1.302734375,
      "learning_rate": 9.768339851466818e-05,
      "loss": 1.8023,
      "step": 1720
    },
    {
      "epoch": 5.095729013254786,
      "grad_norm": 1.3076171875,
      "learning_rate": 9.675703645828794e-05,
      "loss": 1.746,
      "step": 1730
    },
    {
      "epoch": 5.125184094256259,
      "grad_norm": 1.2001953125,
      "learning_rate": 9.583095291095453e-05,
      "loss": 1.7308,
      "step": 1740
    },
    {
      "epoch": 5.154639175257732,
      "grad_norm": 1.3076171875,
      "learning_rate": 9.490522740567633e-05,
      "loss": 1.7452,
      "step": 1750
    },
    {
      "epoch": 5.184094256259205,
      "grad_norm": 1.201171875,
      "learning_rate": 9.397993944471244e-05,
      "loss": 1.7385,
      "step": 1760
    },
    {
      "epoch": 5.213549337260678,
      "grad_norm": 1.3212890625,
      "learning_rate": 9.305516849274541e-05,
      "loss": 1.7481,
      "step": 1770
    },
    {
      "epoch": 5.24300441826215,
      "grad_norm": 1.4599609375,
      "learning_rate": 9.213099397005646e-05,
      "loss": 1.7526,
      "step": 1780
    },
    {
      "epoch": 5.272459499263623,
      "grad_norm": 1.2783203125,
      "learning_rate": 9.12074952457048e-05,
      "loss": 1.7547,
      "step": 1790
    },
    {
      "epoch": 5.3019145802650955,
      "grad_norm": 1.203125,
      "learning_rate": 9.028475163071141e-05,
      "loss": 1.7631,
      "step": 1800
    },
    {
      "epoch": 5.331369661266568,
      "grad_norm": 1.4521484375,
      "learning_rate": 8.936284237124778e-05,
      "loss": 1.8386,
      "step": 1810
    },
    {
      "epoch": 5.360824742268041,
      "grad_norm": 1.1865234375,
      "learning_rate": 8.844184664182993e-05,
      "loss": 1.8157,
      "step": 1820
    },
    {
      "epoch": 5.390279823269514,
      "grad_norm": 1.2294921875,
      "learning_rate": 8.752184353851916e-05,
      "loss": 1.7475,
      "step": 1830
    },
    {
      "epoch": 5.419734904270987,
      "grad_norm": 1.2275390625,
      "learning_rate": 8.660291207212882e-05,
      "loss": 1.7475,
      "step": 1840
    },
    {
      "epoch": 5.44918998527246,
      "grad_norm": 1.263671875,
      "learning_rate": 8.568513116143919e-05,
      "loss": 1.7088,
      "step": 1850
    },
    {
      "epoch": 5.478645066273932,
      "grad_norm": 1.23046875,
      "learning_rate": 8.47685796264195e-05,
      "loss": 1.7893,
      "step": 1860
    },
    {
      "epoch": 5.508100147275405,
      "grad_norm": 1.427734375,
      "learning_rate": 8.385333618145896e-05,
      "loss": 1.7206,
      "step": 1870
    },
    {
      "epoch": 5.537555228276878,
      "grad_norm": 1.1630859375,
      "learning_rate": 8.293947942860666e-05,
      "loss": 1.7444,
      "step": 1880
    },
    {
      "epoch": 5.56701030927835,
      "grad_norm": 1.1826171875,
      "learning_rate": 8.202708785082121e-05,
      "loss": 1.7685,
      "step": 1890
    },
    {
      "epoch": 5.596465390279823,
      "grad_norm": 1.2373046875,
      "learning_rate": 8.111623980523035e-05,
      "loss": 1.7494,
      "step": 1900
    },
    {
      "epoch": 5.625920471281296,
      "grad_norm": 1.248046875,
      "learning_rate": 8.020701351640182e-05,
      "loss": 1.6737,
      "step": 1910
    },
    {
      "epoch": 5.655375552282769,
      "grad_norm": 1.361328125,
      "learning_rate": 7.929948706962508e-05,
      "loss": 1.7643,
      "step": 1920
    },
    {
      "epoch": 5.684830633284242,
      "grad_norm": 1.1435546875,
      "learning_rate": 7.839373840420554e-05,
      "loss": 1.7467,
      "step": 1930
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 1.111328125,
      "learning_rate": 7.748984530677089e-05,
      "loss": 1.7421,
      "step": 1940
    },
    {
      "epoch": 5.743740795287187,
      "grad_norm": 1.3056640625,
      "learning_rate": 7.658788540459062e-05,
      "loss": 1.7451,
      "step": 1950
    },
    {
      "epoch": 5.77319587628866,
      "grad_norm": 1.18359375,
      "learning_rate": 7.568793615890954e-05,
      "loss": 1.7545,
      "step": 1960
    },
    {
      "epoch": 5.802650957290132,
      "grad_norm": 1.115234375,
      "learning_rate": 7.479007485829523e-05,
      "loss": 1.7657,
      "step": 1970
    },
    {
      "epoch": 5.832106038291605,
      "grad_norm": 1.2001953125,
      "learning_rate": 7.389437861200024e-05,
      "loss": 1.8164,
      "step": 1980
    },
    {
      "epoch": 5.8615611192930785,
      "grad_norm": 1.1943359375,
      "learning_rate": 7.30009243433402e-05,
      "loss": 1.748,
      "step": 1990
    },
    {
      "epoch": 5.891016200294551,
      "grad_norm": 1.2333984375,
      "learning_rate": 7.210978878308729e-05,
      "loss": 1.7395,
      "step": 2000
    },
    {
      "epoch": 5.920471281296024,
      "grad_norm": 1.2294921875,
      "learning_rate": 7.122104846288064e-05,
      "loss": 1.7774,
      "step": 2010
    },
    {
      "epoch": 5.9499263622974965,
      "grad_norm": 1.2802734375,
      "learning_rate": 7.033477970865381e-05,
      "loss": 1.7505,
      "step": 2020
    },
    {
      "epoch": 5.979381443298969,
      "grad_norm": 1.2568359375,
      "learning_rate": 6.945105863407951e-05,
      "loss": 1.7314,
      "step": 2030
    },
    {
      "epoch": 6.008836524300442,
      "grad_norm": 1.240234375,
      "learning_rate": 6.85699611340333e-05,
      "loss": 1.7157,
      "step": 2040
    },
    {
      "epoch": 6.0382916053019144,
      "grad_norm": 1.3017578125,
      "learning_rate": 6.76915628780754e-05,
      "loss": 1.6333,
      "step": 2050
    },
    {
      "epoch": 6.067746686303387,
      "grad_norm": 1.3681640625,
      "learning_rate": 6.681593930395209e-05,
      "loss": 1.678,
      "step": 2060
    },
    {
      "epoch": 6.09720176730486,
      "grad_norm": 1.2626953125,
      "learning_rate": 6.594316561111724e-05,
      "loss": 1.7078,
      "step": 2070
    },
    {
      "epoch": 6.126656848306332,
      "grad_norm": 1.228515625,
      "learning_rate": 6.507331675427387e-05,
      "loss": 1.7065,
      "step": 2080
    },
    {
      "epoch": 6.156111929307806,
      "grad_norm": 1.267578125,
      "learning_rate": 6.420646743693714e-05,
      "loss": 1.6533,
      "step": 2090
    },
    {
      "epoch": 6.185567010309279,
      "grad_norm": 1.2919921875,
      "learning_rate": 6.334269210501875e-05,
      "loss": 1.6894,
      "step": 2100
    },
    {
      "epoch": 6.215022091310751,
      "grad_norm": 1.2294921875,
      "learning_rate": 6.248206494043313e-05,
      "loss": 1.6709,
      "step": 2110
    },
    {
      "epoch": 6.244477172312224,
      "grad_norm": 1.2490234375,
      "learning_rate": 6.16246598547271e-05,
      "loss": 1.6944,
      "step": 2120
    },
    {
      "epoch": 6.2739322533136965,
      "grad_norm": 1.373046875,
      "learning_rate": 6.0770550482731924e-05,
      "loss": 1.6468,
      "step": 2130
    },
    {
      "epoch": 6.303387334315169,
      "grad_norm": 1.3154296875,
      "learning_rate": 5.991981017623955e-05,
      "loss": 1.6622,
      "step": 2140
    },
    {
      "epoch": 6.332842415316642,
      "grad_norm": 1.2998046875,
      "learning_rate": 5.9072511997703226e-05,
      "loss": 1.7171,
      "step": 2150
    },
    {
      "epoch": 6.3622974963181145,
      "grad_norm": 1.1591796875,
      "learning_rate": 5.8228728713962543e-05,
      "loss": 1.6501,
      "step": 2160
    },
    {
      "epoch": 6.391752577319588,
      "grad_norm": 1.2763671875,
      "learning_rate": 5.7388532789994476e-05,
      "loss": 1.5946,
      "step": 2170
    },
    {
      "epoch": 6.421207658321061,
      "grad_norm": 1.1728515625,
      "learning_rate": 5.6551996382689776e-05,
      "loss": 1.6424,
      "step": 2180
    },
    {
      "epoch": 6.450662739322533,
      "grad_norm": 1.279296875,
      "learning_rate": 5.571919133465605e-05,
      "loss": 1.6173,
      "step": 2190
    },
    {
      "epoch": 6.480117820324006,
      "grad_norm": 1.26953125,
      "learning_rate": 5.489018916804813e-05,
      "loss": 1.6357,
      "step": 2200
    },
    {
      "epoch": 6.509572901325479,
      "grad_norm": 3.232421875,
      "learning_rate": 5.4065061078425315e-05,
      "loss": 1.6616,
      "step": 2210
    },
    {
      "epoch": 6.539027982326951,
      "grad_norm": 1.2939453125,
      "learning_rate": 5.324387792863719e-05,
      "loss": 1.7015,
      "step": 2220
    },
    {
      "epoch": 6.568483063328424,
      "grad_norm": 1.271484375,
      "learning_rate": 5.242671024273798e-05,
      "loss": 1.7161,
      "step": 2230
    },
    {
      "epoch": 6.597938144329897,
      "grad_norm": 1.2939453125,
      "learning_rate": 5.1613628199929544e-05,
      "loss": 1.6494,
      "step": 2240
    },
    {
      "epoch": 6.627393225331369,
      "grad_norm": 1.21484375,
      "learning_rate": 5.080470162853472e-05,
      "loss": 1.6409,
      "step": 2250
    },
    {
      "epoch": 6.656848306332843,
      "grad_norm": 1.359375,
      "learning_rate": 5.000000000000002e-05,
      "loss": 1.648,
      "step": 2260
    },
    {
      "epoch": 6.686303387334315,
      "grad_norm": 2.451171875,
      "learning_rate": 4.919959242292954e-05,
      "loss": 1.7565,
      "step": 2270
    },
    {
      "epoch": 6.715758468335788,
      "grad_norm": 1.3623046875,
      "learning_rate": 4.840354763714991e-05,
      "loss": 1.6658,
      "step": 2280
    },
    {
      "epoch": 6.745213549337261,
      "grad_norm": 1.388671875,
      "learning_rate": 4.7611934007806666e-05,
      "loss": 1.5883,
      "step": 2290
    },
    {
      "epoch": 6.774668630338733,
      "grad_norm": 1.3837890625,
      "learning_rate": 4.6824819519493057e-05,
      "loss": 1.6502,
      "step": 2300
    },
    {
      "epoch": 6.804123711340206,
      "grad_norm": 1.333984375,
      "learning_rate": 4.604227177041156e-05,
      "loss": 1.632,
      "step": 2310
    },
    {
      "epoch": 6.833578792341679,
      "grad_norm": 1.3525390625,
      "learning_rate": 4.5264357966568306e-05,
      "loss": 1.6804,
      "step": 2320
    },
    {
      "epoch": 6.863033873343152,
      "grad_norm": 1.2587890625,
      "learning_rate": 4.4491144916001425e-05,
      "loss": 1.6897,
      "step": 2330
    },
    {
      "epoch": 6.892488954344625,
      "grad_norm": 1.2490234375,
      "learning_rate": 4.372269902304363e-05,
      "loss": 1.6649,
      "step": 2340
    },
    {
      "epoch": 6.9219440353460975,
      "grad_norm": 1.2705078125,
      "learning_rate": 4.29590862826191e-05,
      "loss": 1.7002,
      "step": 2350
    },
    {
      "epoch": 6.95139911634757,
      "grad_norm": 1.240234375,
      "learning_rate": 4.2200372274576e-05,
      "loss": 1.6725,
      "step": 2360
    },
    {
      "epoch": 6.980854197349043,
      "grad_norm": 1.26953125,
      "learning_rate": 4.144662215805426e-05,
      "loss": 1.6438,
      "step": 2370
    },
    {
      "epoch": 7.010309278350515,
      "grad_norm": 1.30859375,
      "learning_rate": 4.069790066588967e-05,
      "loss": 1.6172,
      "step": 2380
    },
    {
      "epoch": 7.039764359351988,
      "grad_norm": 1.58203125,
      "learning_rate": 3.995427209905469e-05,
      "loss": 1.6116,
      "step": 2390
    },
    {
      "epoch": 7.069219440353461,
      "grad_norm": 1.31640625,
      "learning_rate": 3.921580032113602e-05,
      "loss": 1.5673,
      "step": 2400
    },
    {
      "epoch": 7.098674521354933,
      "grad_norm": 1.275390625,
      "learning_rate": 3.848254875285e-05,
      "loss": 1.5971,
      "step": 2410
    },
    {
      "epoch": 7.128129602356406,
      "grad_norm": 1.521484375,
      "learning_rate": 3.7754580366596115e-05,
      "loss": 1.6331,
      "step": 2420
    },
    {
      "epoch": 7.15758468335788,
      "grad_norm": 1.4345703125,
      "learning_rate": 3.7031957681048604e-05,
      "loss": 1.5961,
      "step": 2430
    },
    {
      "epoch": 7.187039764359352,
      "grad_norm": 1.3564453125,
      "learning_rate": 3.631474275578754e-05,
      "loss": 1.6064,
      "step": 2440
    },
    {
      "epoch": 7.216494845360825,
      "grad_norm": 1.3017578125,
      "learning_rate": 3.560299718596889e-05,
      "loss": 1.5493,
      "step": 2450
    },
    {
      "epoch": 7.2459499263622975,
      "grad_norm": 1.2578125,
      "learning_rate": 3.489678209703475e-05,
      "loss": 1.6051,
      "step": 2460
    },
    {
      "epoch": 7.27540500736377,
      "grad_norm": 1.3056640625,
      "learning_rate": 3.4196158139463915e-05,
      "loss": 1.6425,
      "step": 2470
    },
    {
      "epoch": 7.304860088365243,
      "grad_norm": 1.423828125,
      "learning_rate": 3.3501185483562994e-05,
      "loss": 1.68,
      "step": 2480
    },
    {
      "epoch": 7.3343151693667155,
      "grad_norm": 1.1806640625,
      "learning_rate": 3.281192381429894e-05,
      "loss": 1.6,
      "step": 2490
    },
    {
      "epoch": 7.363770250368188,
      "grad_norm": 1.3212890625,
      "learning_rate": 3.212843232617343e-05,
      "loss": 1.5919,
      "step": 2500
    },
    {
      "epoch": 7.393225331369662,
      "grad_norm": 1.2099609375,
      "learning_rate": 3.145076971813891e-05,
      "loss": 1.6371,
      "step": 2510
    },
    {
      "epoch": 7.422680412371134,
      "grad_norm": 1.6533203125,
      "learning_rate": 3.077899418855772e-05,
      "loss": 1.6582,
      "step": 2520
    },
    {
      "epoch": 7.452135493372607,
      "grad_norm": 1.3759765625,
      "learning_rate": 3.0113163430203772e-05,
      "loss": 1.642,
      "step": 2530
    },
    {
      "epoch": 7.48159057437408,
      "grad_norm": 1.2509765625,
      "learning_rate": 2.945333462530788e-05,
      "loss": 1.5679,
      "step": 2540
    },
    {
      "epoch": 7.511045655375552,
      "grad_norm": 1.3740234375,
      "learning_rate": 2.879956444064703e-05,
      "loss": 1.5998,
      "step": 2550
    },
    {
      "epoch": 7.540500736377025,
      "grad_norm": 1.3916015625,
      "learning_rate": 2.815190902267757e-05,
      "loss": 1.5767,
      "step": 2560
    },
    {
      "epoch": 7.5699558173784975,
      "grad_norm": 1.3115234375,
      "learning_rate": 2.7510423992713374e-05,
      "loss": 1.597,
      "step": 2570
    },
    {
      "epoch": 7.59941089837997,
      "grad_norm": 1.3876953125,
      "learning_rate": 2.6875164442149147e-05,
      "loss": 1.559,
      "step": 2580
    },
    {
      "epoch": 7.628865979381443,
      "grad_norm": 1.330078125,
      "learning_rate": 2.624618492772891e-05,
      "loss": 1.6197,
      "step": 2590
    },
    {
      "epoch": 7.658321060382916,
      "grad_norm": 1.478515625,
      "learning_rate": 2.5623539466860813e-05,
      "loss": 1.6207,
      "step": 2600
    },
    {
      "epoch": 7.687776141384389,
      "grad_norm": 1.421875,
      "learning_rate": 2.500728153297788e-05,
      "loss": 1.6783,
      "step": 2610
    },
    {
      "epoch": 7.717231222385862,
      "grad_norm": 1.3701171875,
      "learning_rate": 2.439746405094575e-05,
      "loss": 1.6265,
      "step": 2620
    },
    {
      "epoch": 7.746686303387334,
      "grad_norm": 1.3115234375,
      "learning_rate": 2.379413939251751e-05,
      "loss": 1.6028,
      "step": 2630
    },
    {
      "epoch": 7.776141384388807,
      "grad_norm": 1.70703125,
      "learning_rate": 2.3197359371835802e-05,
      "loss": 1.6263,
      "step": 2640
    },
    {
      "epoch": 7.80559646539028,
      "grad_norm": 1.318359375,
      "learning_rate": 2.2607175240983026e-05,
      "loss": 1.6417,
      "step": 2650
    },
    {
      "epoch": 7.835051546391752,
      "grad_norm": 1.3759765625,
      "learning_rate": 2.2023637685579856e-05,
      "loss": 1.5317,
      "step": 2660
    },
    {
      "epoch": 7.864506627393226,
      "grad_norm": 1.453125,
      "learning_rate": 2.1446796820432167e-05,
      "loss": 1.5853,
      "step": 2670
    },
    {
      "epoch": 7.8939617083946985,
      "grad_norm": 1.3369140625,
      "learning_rate": 2.0876702185227137e-05,
      "loss": 1.5672,
      "step": 2680
    },
    {
      "epoch": 7.923416789396171,
      "grad_norm": 1.3466796875,
      "learning_rate": 2.0313402740278908e-05,
      "loss": 1.5869,
      "step": 2690
    },
    {
      "epoch": 7.952871870397644,
      "grad_norm": 1.3984375,
      "learning_rate": 1.9756946862323535e-05,
      "loss": 1.5456,
      "step": 2700
    },
    {
      "epoch": 7.982326951399116,
      "grad_norm": 1.310546875,
      "learning_rate": 1.9207382340364634e-05,
      "loss": 1.5585,
      "step": 2710
    },
    {
      "epoch": 8.011782032400589,
      "grad_norm": 1.47265625,
      "learning_rate": 1.866475637156898e-05,
      "loss": 1.6146,
      "step": 2720
    },
    {
      "epoch": 8.041237113402062,
      "grad_norm": 1.3291015625,
      "learning_rate": 1.8129115557213262e-05,
      "loss": 1.5909,
      "step": 2730
    },
    {
      "epoch": 8.070692194403534,
      "grad_norm": 1.4443359375,
      "learning_rate": 1.7600505898681997e-05,
      "loss": 1.5275,
      "step": 2740
    },
    {
      "epoch": 8.100147275405007,
      "grad_norm": 1.3486328125,
      "learning_rate": 1.707897279351671e-05,
      "loss": 1.4641,
      "step": 2750
    },
    {
      "epoch": 8.12960235640648,
      "grad_norm": 1.3115234375,
      "learning_rate": 1.656456103151728e-05,
      "loss": 1.61,
      "step": 2760
    },
    {
      "epoch": 8.159057437407952,
      "grad_norm": 1.3779296875,
      "learning_rate": 1.605731479089534e-05,
      "loss": 1.5912,
      "step": 2770
    },
    {
      "epoch": 8.188512518409425,
      "grad_norm": 1.3515625,
      "learning_rate": 1.5557277634480083e-05,
      "loss": 1.5664,
      "step": 2780
    },
    {
      "epoch": 8.217967599410898,
      "grad_norm": 1.458984375,
      "learning_rate": 1.5064492505977234e-05,
      "loss": 1.5658,
      "step": 2790
    },
    {
      "epoch": 8.24742268041237,
      "grad_norm": 1.3330078125,
      "learning_rate": 1.4579001726280828e-05,
      "loss": 1.6019,
      "step": 2800
    },
    {
      "epoch": 8.276877761413845,
      "grad_norm": 1.4013671875,
      "learning_rate": 1.41008469898387e-05,
      "loss": 1.6153,
      "step": 2810
    },
    {
      "epoch": 8.306332842415317,
      "grad_norm": 1.6611328125,
      "learning_rate": 1.363006936107183e-05,
      "loss": 1.6229,
      "step": 2820
    },
    {
      "epoch": 8.33578792341679,
      "grad_norm": 1.3984375,
      "learning_rate": 1.3166709270847511e-05,
      "loss": 1.5794,
      "step": 2830
    },
    {
      "epoch": 8.365243004418263,
      "grad_norm": 1.4560546875,
      "learning_rate": 1.271080651300719e-05,
      "loss": 1.5704,
      "step": 2840
    },
    {
      "epoch": 8.394698085419735,
      "grad_norm": 1.3349609375,
      "learning_rate": 1.2262400240949023e-05,
      "loss": 1.5124,
      "step": 2850
    },
    {
      "epoch": 8.424153166421208,
      "grad_norm": 1.5546875,
      "learning_rate": 1.182152896426515e-05,
      "loss": 1.5907,
      "step": 2860
    },
    {
      "epoch": 8.45360824742268,
      "grad_norm": 1.2841796875,
      "learning_rate": 1.1388230545434653e-05,
      "loss": 1.5517,
      "step": 2870
    },
    {
      "epoch": 8.483063328424153,
      "grad_norm": 1.3115234375,
      "learning_rate": 1.0962542196571634e-05,
      "loss": 1.567,
      "step": 2880
    },
    {
      "epoch": 8.512518409425626,
      "grad_norm": 1.41796875,
      "learning_rate": 1.0544500476229713e-05,
      "loss": 1.4715,
      "step": 2890
    },
    {
      "epoch": 8.541973490427099,
      "grad_norm": 1.5263671875,
      "learning_rate": 1.013414128626211e-05,
      "loss": 1.5595,
      "step": 2900
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 1.44140625,
      "learning_rate": 9.731499868738447e-06,
      "loss": 1.5258,
      "step": 2910
    },
    {
      "epoch": 8.600883652430044,
      "grad_norm": 1.2353515625,
      "learning_rate": 9.336610802918044e-06,
      "loss": 1.5395,
      "step": 2920
    },
    {
      "epoch": 8.630338733431516,
      "grad_norm": 1.3125,
      "learning_rate": 8.949508002280382e-06,
      "loss": 1.571,
      "step": 2930
    },
    {
      "epoch": 8.65979381443299,
      "grad_norm": 1.3330078125,
      "learning_rate": 8.570224711612385e-06,
      "loss": 1.6215,
      "step": 2940
    },
    {
      "epoch": 8.689248895434462,
      "grad_norm": 1.404296875,
      "learning_rate": 8.19879350415349e-06,
      "loss": 1.5996,
      "step": 2950
    },
    {
      "epoch": 8.718703976435934,
      "grad_norm": 1.5576171875,
      "learning_rate": 7.835246278798037e-06,
      "loss": 1.5238,
      "step": 2960
    },
    {
      "epoch": 8.748159057437409,
      "grad_norm": 1.501953125,
      "learning_rate": 7.479614257355971e-06,
      "loss": 1.593,
      "step": 2970
    },
    {
      "epoch": 8.777614138438881,
      "grad_norm": 1.5166015625,
      "learning_rate": 7.1319279818713445e-06,
      "loss": 1.5612,
      "step": 2980
    },
    {
      "epoch": 8.807069219440354,
      "grad_norm": 1.7763671875,
      "learning_rate": 6.7922173119993606e-06,
      "loss": 1.5534,
      "step": 2990
    },
    {
      "epoch": 8.836524300441827,
      "grad_norm": 1.388671875,
      "learning_rate": 6.460511422441984e-06,
      "loss": 1.5691,
      "step": 3000
    },
    {
      "epoch": 8.8659793814433,
      "grad_norm": 1.412109375,
      "learning_rate": 6.136838800442457e-06,
      "loss": 1.6044,
      "step": 3010
    },
    {
      "epoch": 8.895434462444772,
      "grad_norm": 1.36328125,
      "learning_rate": 5.821227243338712e-06,
      "loss": 1.6178,
      "step": 3020
    },
    {
      "epoch": 8.924889543446245,
      "grad_norm": 1.41796875,
      "learning_rate": 5.5137038561761115e-06,
      "loss": 1.6223,
      "step": 3030
    },
    {
      "epoch": 8.954344624447717,
      "grad_norm": 1.44140625,
      "learning_rate": 5.214295049379658e-06,
      "loss": 1.5837,
      "step": 3040
    },
    {
      "epoch": 8.98379970544919,
      "grad_norm": 1.3671875,
      "learning_rate": 4.923026536485875e-06,
      "loss": 1.523,
      "step": 3050
    },
    {
      "epoch": 9.013254786450663,
      "grad_norm": 1.33203125,
      "learning_rate": 4.639923331934471e-06,
      "loss": 1.5245,
      "step": 3060
    },
    {
      "epoch": 9.042709867452135,
      "grad_norm": 1.40625,
      "learning_rate": 4.365009748920012e-06,
      "loss": 1.5816,
      "step": 3070
    },
    {
      "epoch": 9.072164948453608,
      "grad_norm": 1.255859375,
      "learning_rate": 4.098309397303978e-06,
      "loss": 1.5324,
      "step": 3080
    },
    {
      "epoch": 9.10162002945508,
      "grad_norm": 1.5517578125,
      "learning_rate": 3.839845181587098e-06,
      "loss": 1.6157,
      "step": 3090
    },
    {
      "epoch": 9.131075110456553,
      "grad_norm": 1.2978515625,
      "learning_rate": 3.5896392989422377e-06,
      "loss": 1.5071,
      "step": 3100
    },
    {
      "epoch": 9.160530191458026,
      "grad_norm": 1.537109375,
      "learning_rate": 3.3477132373081254e-06,
      "loss": 1.646,
      "step": 3110
    },
    {
      "epoch": 9.189985272459499,
      "grad_norm": 1.373046875,
      "learning_rate": 3.1140877735439387e-06,
      "loss": 1.5886,
      "step": 3120
    },
    {
      "epoch": 9.219440353460971,
      "grad_norm": 1.314453125,
      "learning_rate": 2.8887829716449876e-06,
      "loss": 1.5618,
      "step": 3130
    },
    {
      "epoch": 9.248895434462444,
      "grad_norm": 1.326171875,
      "learning_rate": 2.6718181810195696e-06,
      "loss": 1.5741,
      "step": 3140
    },
    {
      "epoch": 9.278350515463918,
      "grad_norm": 1.3994140625,
      "learning_rate": 2.4632120348272003e-06,
      "loss": 1.5673,
      "step": 3150
    },
    {
      "epoch": 9.307805596465391,
      "grad_norm": 1.4306640625,
      "learning_rate": 2.2629824483784366e-06,
      "loss": 1.5312,
      "step": 3160
    },
    {
      "epoch": 9.337260677466864,
      "grad_norm": 1.4052734375,
      "learning_rate": 2.0711466175962756e-06,
      "loss": 1.5329,
      "step": 3170
    },
    {
      "epoch": 9.366715758468336,
      "grad_norm": 1.3193359375,
      "learning_rate": 1.88772101753929e-06,
      "loss": 1.5319,
      "step": 3180
    },
    {
      "epoch": 9.396170839469809,
      "grad_norm": 1.3486328125,
      "learning_rate": 1.7127214009868385e-06,
      "loss": 1.4908,
      "step": 3190
    },
    {
      "epoch": 9.425625920471282,
      "grad_norm": 1.4267578125,
      "learning_rate": 1.5461627970860814e-06,
      "loss": 1.6365,
      "step": 3200
    },
    {
      "epoch": 9.455081001472754,
      "grad_norm": 1.4658203125,
      "learning_rate": 1.3880595100613792e-06,
      "loss": 1.5504,
      "step": 3210
    },
    {
      "epoch": 9.484536082474227,
      "grad_norm": 1.685546875,
      "learning_rate": 1.2384251179857643e-06,
      "loss": 1.5451,
      "step": 3220
    },
    {
      "epoch": 9.5139911634757,
      "grad_norm": 1.4287109375,
      "learning_rate": 1.0972724716148187e-06,
      "loss": 1.5691,
      "step": 3230
    },
    {
      "epoch": 9.543446244477172,
      "grad_norm": 1.345703125,
      "learning_rate": 9.64613693283123e-07,
      "loss": 1.5342,
      "step": 3240
    },
    {
      "epoch": 9.572901325478645,
      "grad_norm": 1.3359375,
      "learning_rate": 8.404601758630892e-07,
      "loss": 1.5575,
      "step": 3250
    },
    {
      "epoch": 9.602356406480117,
      "grad_norm": 1.3681640625,
      "learning_rate": 7.248225817865884e-07,
      "loss": 1.5327,
      "step": 3260
    },
    {
      "epoch": 9.63181148748159,
      "grad_norm": 1.37890625,
      "learning_rate": 6.177108421292266e-07,
      "loss": 1.5548,
      "step": 3270
    },
    {
      "epoch": 9.661266568483063,
      "grad_norm": 1.353515625,
      "learning_rate": 5.191341557574392e-07,
      "loss": 1.6446,
      "step": 3280
    },
    {
      "epoch": 9.690721649484535,
      "grad_norm": 1.3310546875,
      "learning_rate": 4.291009885385333e-07,
      "loss": 1.4653,
      "step": 3290
    },
    {
      "epoch": 9.720176730486008,
      "grad_norm": 1.380859375,
      "learning_rate": 3.4761907261356976e-07,
      "loss": 1.5371,
      "step": 3300
    },
    {
      "epoch": 9.749631811487482,
      "grad_norm": 1.4384765625,
      "learning_rate": 2.746954057333606e-07,
      "loss": 1.5145,
      "step": 3310
    },
    {
      "epoch": 9.779086892488955,
      "grad_norm": 1.3583984375,
      "learning_rate": 2.1033625065747242e-07,
      "loss": 1.6238,
      "step": 3320
    },
    {
      "epoch": 9.808541973490428,
      "grad_norm": 1.3671875,
      "learning_rate": 1.545471346164007e-07,
      "loss": 1.5669,
      "step": 3330
    },
    {
      "epoch": 9.8379970544919,
      "grad_norm": 1.34765625,
      "learning_rate": 1.0733284883682749e-07,
      "loss": 1.5792,
      "step": 3340
    },
    {
      "epoch": 9.867452135493373,
      "grad_norm": 1.4501953125,
      "learning_rate": 6.869744813023937e-08,
      "loss": 1.6,
      "step": 3350
    },
    {
      "epoch": 9.896907216494846,
      "grad_norm": 1.443359375,
      "learning_rate": 3.8644250544594975e-08,
      "loss": 1.5093,
      "step": 3360
    },
    {
      "epoch": 9.926362297496318,
      "grad_norm": 1.51171875,
      "learning_rate": 1.7175837079452804e-08,
      "loss": 1.5322,
      "step": 3370
    },
    {
      "epoch": 9.955817378497791,
      "grad_norm": 1.3876953125,
      "learning_rate": 4.2940514642597626e-09,
      "loss": 1.5623,
      "step": 3380
    },
    {
      "epoch": 9.985272459499264,
      "grad_norm": 1.30078125,
      "learning_rate": 0.0,
      "loss": 1.5123,
      "step": 3390
    },
    {
      "epoch": 9.985272459499264,
      "step": 3390,
      "total_flos": 2.22628436508672e+17,
      "train_loss": 1.877983266273431,
      "train_runtime": 3392.4525,
      "train_samples_per_second": 4.003,
      "train_steps_per_second": 0.999
    }
  ],
  "logging_steps": 10,
  "max_steps": 3390,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 2.22628436508672e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}