{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.36951013513513514, "eval_steps": 406, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "eval_loss": 10.780741691589355, "eval_runtime": 947.394, "eval_samples_per_second": 79.833, "eval_steps_per_second": 4.435, "step": 1 }, { "epoch": 0.0, "learning_rate": 2.25e-05, "loss": 10.7434, "step": 5 }, { "epoch": 0.0, "learning_rate": 4.5e-05, "loss": 9.9589, "step": 10 }, { "epoch": 0.0, "learning_rate": 6.75e-05, "loss": 8.6595, "step": 15 }, { "epoch": 0.0, "learning_rate": 9e-05, "loss": 7.9676, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.0001125, "loss": 7.4358, "step": 25 }, { "epoch": 0.01, "learning_rate": 0.000135, "loss": 7.1452, "step": 30 }, { "epoch": 0.01, "learning_rate": 0.00015749999999999998, "loss": 6.9396, "step": 35 }, { "epoch": 0.01, "learning_rate": 0.00018, "loss": 6.8089, "step": 40 }, { "epoch": 0.01, "learning_rate": 0.0002025, "loss": 6.6897, "step": 45 }, { "epoch": 0.01, "learning_rate": 0.000225, "loss": 6.4553, "step": 50 }, { "epoch": 0.01, "learning_rate": 0.00022499913644577618, "loss": 6.3166, "step": 55 }, { "epoch": 0.01, "learning_rate": 0.0002249965457963621, "loss": 6.1718, "step": 60 }, { "epoch": 0.02, "learning_rate": 0.00022499222809152964, "loss": 6.0497, "step": 65 }, { "epoch": 0.02, "learning_rate": 0.00022498618339756446, "loss": 5.9306, "step": 70 }, { "epoch": 0.02, "learning_rate": 0.00022497841180726518, "loss": 5.825, "step": 75 }, { "epoch": 0.02, "learning_rate": 0.00022496891343994188, "loss": 5.7394, "step": 80 }, { "epoch": 0.02, "learning_rate": 0.00022495768844141414, "loss": 5.6247, "step": 85 }, { "epoch": 0.02, "learning_rate": 0.000224944736984009, "loss": 5.5139, "step": 90 }, { "epoch": 0.02, "learning_rate": 0.00022493005926655827, "loss": 5.3914, "step": 95 }, { "epoch": 0.02, "learning_rate": 0.0002249136555143953, "loss": 5.369, "step": 100 }, { "epoch": 0.03, "learning_rate": 0.00022489552597935173, "loss": 5.3003, "step": 105 }, { "epoch": 0.03, "learning_rate": 0.00022487567093975358, "loss": 5.2256, "step": 110 }, { "epoch": 0.03, "learning_rate": 0.00022485409070041688, "loss": 5.141, "step": 115 }, { "epoch": 0.03, "learning_rate": 0.00022483078559264308, "loss": 5.0825, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.00022480575597421393, "loss": 5.0306, "step": 125 }, { "epoch": 0.03, "learning_rate": 0.0002247790022293861, "loss": 4.9651, "step": 130 }, { "epoch": 0.03, "learning_rate": 0.000224750524768885, "loss": 4.9608, "step": 135 }, { "epoch": 0.03, "learning_rate": 0.00022472032402989878, "loss": 4.9164, "step": 140 }, { "epoch": 0.04, "learning_rate": 0.00022468840047607143, "loss": 4.8746, "step": 145 }, { "epoch": 0.04, "learning_rate": 0.00022465475459749576, "loss": 4.7804, "step": 150 }, { "epoch": 0.04, "learning_rate": 0.00022461938691070582, "loss": 4.7936, "step": 155 }, { "epoch": 0.04, "learning_rate": 0.000224582297958669, "loss": 4.7272, "step": 160 }, { "epoch": 0.04, "learning_rate": 0.00022454348831077767, "loss": 4.7058, "step": 165 }, { "epoch": 0.04, "learning_rate": 0.00022450295856284047, "loss": 4.6486, "step": 170 }, { "epoch": 0.04, "learning_rate": 0.0002244607093370731, "loss": 4.6536, "step": 175 }, { "epoch": 0.04, "learning_rate": 0.0002244167412820889, "loss": 4.5721, "step": 180 }, { "epoch": 0.05, "learning_rate": 0.00022437105507288872, "loss": 4.5511, "step": 185 }, { "epoch": 0.05, "learning_rate": 0.00022432365141085068, "loss": 4.4932, "step": 190 }, { "epoch": 0.05, "learning_rate": 0.00022427453102371933, "loss": 4.5251, "step": 195 }, { "epoch": 0.05, "learning_rate": 0.0002242236946655946, "loss": 4.4763, "step": 200 }, { "epoch": 0.05, "learning_rate": 0.00022417114311692, "loss": 4.5116, "step": 205 }, { "epoch": 0.05, "learning_rate": 0.00022411687718447093, "loss": 4.4102, "step": 210 }, { "epoch": 0.05, "learning_rate": 0.00022406089770134205, "loss": 4.4343, "step": 215 }, { "epoch": 0.05, "learning_rate": 0.00022400320552693452, "loss": 4.4403, "step": 220 }, { "epoch": 0.06, "learning_rate": 0.000223943801546943, "loss": 4.4068, "step": 225 }, { "epoch": 0.06, "learning_rate": 0.0002238826866733418, "loss": 4.3988, "step": 230 }, { "epoch": 0.06, "learning_rate": 0.00022381986184437112, "loss": 4.3703, "step": 235 }, { "epoch": 0.06, "learning_rate": 0.00022375532802452238, "loss": 4.303, "step": 240 }, { "epoch": 0.06, "learning_rate": 0.00022368908620452367, "loss": 4.2979, "step": 245 }, { "epoch": 0.06, "learning_rate": 0.00022362113740132436, "loss": 4.2974, "step": 250 }, { "epoch": 0.06, "learning_rate": 0.00022355148265807966, "loss": 4.2652, "step": 255 }, { "epoch": 0.06, "learning_rate": 0.00022348012304413426, "loss": 4.203, "step": 260 }, { "epoch": 0.07, "learning_rate": 0.00022340705965500642, "loss": 4.2045, "step": 265 }, { "epoch": 0.07, "learning_rate": 0.00022333229361237082, "loss": 4.2342, "step": 270 }, { "epoch": 0.07, "learning_rate": 0.00022325582606404126, "loss": 4.1869, "step": 275 }, { "epoch": 0.07, "learning_rate": 0.00022317765818395332, "loss": 4.0939, "step": 280 }, { "epoch": 0.07, "learning_rate": 0.00022309779117214617, "loss": 4.1401, "step": 285 }, { "epoch": 0.07, "learning_rate": 0.00022301622625474417, "loss": 4.0744, "step": 290 }, { "epoch": 0.07, "learning_rate": 0.00022293296468393808, "loss": 4.0818, "step": 295 }, { "epoch": 0.07, "learning_rate": 0.0002228480077379657, "loss": 4.029, "step": 300 }, { "epoch": 0.08, "learning_rate": 0.00022276135672109258, "loss": 4.0574, "step": 305 }, { "epoch": 0.08, "learning_rate": 0.00022267301296359155, "loss": 4.0003, "step": 310 }, { "epoch": 0.08, "learning_rate": 0.00022258297782172258, "loss": 4.0143, "step": 315 }, { "epoch": 0.08, "learning_rate": 0.000222491252677712, "loss": 3.9962, "step": 320 }, { "epoch": 0.08, "learning_rate": 0.0002223978389397311, "loss": 3.9994, "step": 325 }, { "epoch": 0.08, "learning_rate": 0.00022230273804187456, "loss": 3.9424, "step": 330 }, { "epoch": 0.08, "learning_rate": 0.00022220595144413854, "loss": 3.9215, "step": 335 }, { "epoch": 0.08, "learning_rate": 0.00022210748063239815, "loss": 3.9483, "step": 340 }, { "epoch": 0.08, "learning_rate": 0.00022200732711838466, "loss": 3.9037, "step": 345 }, { "epoch": 0.09, "learning_rate": 0.00022190549243966234, "loss": 3.8959, "step": 350 }, { "epoch": 0.09, "learning_rate": 0.0002218019781596049, "loss": 3.8066, "step": 355 }, { "epoch": 0.09, "learning_rate": 0.00022169678586737127, "loss": 3.8306, "step": 360 }, { "epoch": 0.09, "learning_rate": 0.00022158991717788137, "loss": 3.7961, "step": 365 }, { "epoch": 0.09, "learning_rate": 0.00022148137373179146, "loss": 3.7739, "step": 370 }, { "epoch": 0.09, "learning_rate": 0.0002213711571954686, "loss": 3.7727, "step": 375 }, { "epoch": 0.09, "learning_rate": 0.00022125926926096538, "loss": 3.7895, "step": 380 }, { "epoch": 0.09, "learning_rate": 0.0002211457116459937, "loss": 3.7839, "step": 385 }, { "epoch": 0.1, "learning_rate": 0.00022103048609389868, "loss": 3.7261, "step": 390 }, { "epoch": 0.1, "learning_rate": 0.00022091359437363157, "loss": 3.7129, "step": 395 }, { "epoch": 0.1, "learning_rate": 0.00022079503827972293, "loss": 3.6765, "step": 400 }, { "epoch": 0.1, "learning_rate": 0.0002206748196322547, "loss": 3.6463, "step": 405 }, { "epoch": 0.1, "eval_loss": 3.6351399421691895, "eval_runtime": 955.1583, "eval_samples_per_second": 79.184, "eval_steps_per_second": 4.399, "step": 406 }, { "epoch": 0.1, "learning_rate": 0.00022055294027683266, "loss": 3.6138, "step": 410 }, { "epoch": 0.1, "learning_rate": 0.0002204294020845578, "loss": 3.6069, "step": 415 }, { "epoch": 0.1, "learning_rate": 0.00022030420695199774, "loss": 3.5781, "step": 420 }, { "epoch": 0.1, "learning_rate": 0.00022017735680115755, "loss": 3.5925, "step": 425 }, { "epoch": 0.11, "learning_rate": 0.00022004885357945026, "loss": 3.6038, "step": 430 }, { "epoch": 0.11, "learning_rate": 0.000219918699259667, "loss": 3.5368, "step": 435 }, { "epoch": 0.11, "learning_rate": 0.00021978689583994666, "loss": 3.5982, "step": 440 }, { "epoch": 0.11, "learning_rate": 0.00021965344534374522, "loss": 3.5501, "step": 445 }, { "epoch": 0.11, "learning_rate": 0.0002195183498198047, "loss": 3.5315, "step": 450 }, { "epoch": 0.11, "learning_rate": 0.00021938161134212177, "loss": 3.5229, "step": 455 }, { "epoch": 0.11, "learning_rate": 0.00021924323200991577, "loss": 3.5106, "step": 460 }, { "epoch": 0.11, "learning_rate": 0.00021910321394759662, "loss": 3.4851, "step": 465 }, { "epoch": 0.12, "learning_rate": 0.00021896155930473216, "loss": 3.4405, "step": 470 }, { "epoch": 0.12, "learning_rate": 0.00021881827025601504, "loss": 3.5036, "step": 475 }, { "epoch": 0.12, "learning_rate": 0.00021867334900122954, "loss": 3.4158, "step": 480 }, { "epoch": 0.12, "learning_rate": 0.0002185267977652176, "loss": 3.465, "step": 485 }, { "epoch": 0.12, "learning_rate": 0.00021837861879784484, "loss": 3.3843, "step": 490 }, { "epoch": 0.12, "learning_rate": 0.0002182288143739659, "loss": 3.3678, "step": 495 }, { "epoch": 0.12, "learning_rate": 0.00021807738679338953, "loss": 3.4079, "step": 500 }, { "epoch": 0.12, "learning_rate": 0.0002179243383808433, "loss": 3.3826, "step": 505 }, { "epoch": 0.13, "learning_rate": 0.00021776967148593793, "loss": 3.4016, "step": 510 }, { "epoch": 0.13, "learning_rate": 0.00021761338848313123, "loss": 3.3715, "step": 515 }, { "epoch": 0.13, "learning_rate": 0.0002174554917716916, "loss": 3.4046, "step": 520 }, { "epoch": 0.13, "learning_rate": 0.00021729598377566122, "loss": 3.3304, "step": 525 }, { "epoch": 0.13, "learning_rate": 0.00021713486694381875, "loss": 3.3419, "step": 530 }, { "epoch": 0.13, "learning_rate": 0.00021697214374964195, "loss": 3.3681, "step": 535 }, { "epoch": 0.13, "learning_rate": 0.0002168078166912695, "loss": 3.3012, "step": 540 }, { "epoch": 0.13, "learning_rate": 0.00021664188829146277, "loss": 3.3551, "step": 545 }, { "epoch": 0.14, "learning_rate": 0.000216474361097567, "loss": 3.2824, "step": 550 }, { "epoch": 0.14, "learning_rate": 0.00021630523768147218, "loss": 3.3024, "step": 555 }, { "epoch": 0.14, "learning_rate": 0.00021613452063957379, "loss": 3.2661, "step": 560 }, { "epoch": 0.14, "learning_rate": 0.00021596221259273266, "loss": 3.2882, "step": 565 }, { "epoch": 0.14, "learning_rate": 0.0002157883161862348, "loss": 3.3181, "step": 570 }, { "epoch": 0.14, "learning_rate": 0.00021561283408975097, "loss": 3.2574, "step": 575 }, { "epoch": 0.14, "learning_rate": 0.00021543576899729543, "loss": 3.2339, "step": 580 }, { "epoch": 0.14, "learning_rate": 0.00021525712362718483, "loss": 3.2554, "step": 585 }, { "epoch": 0.15, "learning_rate": 0.00021507690072199625, "loss": 3.1892, "step": 590 }, { "epoch": 0.15, "learning_rate": 0.00021489510304852536, "loss": 3.2946, "step": 595 }, { "epoch": 0.15, "learning_rate": 0.00021471173339774363, "loss": 3.2303, "step": 600 }, { "epoch": 0.15, "learning_rate": 0.00021452679458475567, "loss": 3.2891, "step": 605 }, { "epoch": 0.15, "learning_rate": 0.00021434028944875607, "loss": 3.224, "step": 610 }, { "epoch": 0.15, "learning_rate": 0.00021415222085298573, "loss": 3.2337, "step": 615 }, { "epoch": 0.15, "learning_rate": 0.00021396259168468773, "loss": 3.2158, "step": 620 }, { "epoch": 0.15, "learning_rate": 0.0002137714048550634, "loss": 3.2126, "step": 625 }, { "epoch": 0.16, "learning_rate": 0.0002135786632992273, "loss": 3.1934, "step": 630 }, { "epoch": 0.16, "learning_rate": 0.00021338436997616223, "loss": 3.2051, "step": 635 }, { "epoch": 0.16, "learning_rate": 0.00021318852786867388, "loss": 3.2667, "step": 640 }, { "epoch": 0.16, "learning_rate": 0.00021299113998334503, "loss": 3.1956, "step": 645 }, { "epoch": 0.16, "learning_rate": 0.00021279220935048926, "loss": 3.1771, "step": 650 }, { "epoch": 0.16, "learning_rate": 0.0002125917390241046, "loss": 3.1467, "step": 655 }, { "epoch": 0.16, "learning_rate": 0.00021238973208182659, "loss": 3.1788, "step": 660 }, { "epoch": 0.16, "learning_rate": 0.00021218619162488095, "loss": 3.1967, "step": 665 }, { "epoch": 0.17, "learning_rate": 0.00021198112077803607, "loss": 3.149, "step": 670 }, { "epoch": 0.17, "learning_rate": 0.00021177452268955496, "loss": 3.154, "step": 675 }, { "epoch": 0.17, "learning_rate": 0.000211566400531147, "loss": 3.1652, "step": 680 }, { "epoch": 0.17, "learning_rate": 0.00021135675749791924, "loss": 3.1433, "step": 685 }, { "epoch": 0.17, "learning_rate": 0.00021114559680832722, "loss": 3.1893, "step": 690 }, { "epoch": 0.17, "learning_rate": 0.0002109329217041257, "loss": 3.1788, "step": 695 }, { "epoch": 0.17, "learning_rate": 0.00021071873545031885, "loss": 3.1549, "step": 700 }, { "epoch": 0.17, "learning_rate": 0.00021050304133511018, "loss": 3.1294, "step": 705 }, { "epoch": 0.17, "learning_rate": 0.00021028584266985186, "loss": 3.1109, "step": 710 }, { "epoch": 0.18, "learning_rate": 0.00021006714278899415, "loss": 3.1713, "step": 715 }, { "epoch": 0.18, "learning_rate": 0.00020984694505003402, "loss": 3.1304, "step": 720 }, { "epoch": 0.18, "learning_rate": 0.00020962525283346376, "loss": 3.1285, "step": 725 }, { "epoch": 0.18, "learning_rate": 0.0002094020695427188, "loss": 3.0582, "step": 730 }, { "epoch": 0.18, "learning_rate": 0.00020917739860412592, "loss": 3.1063, "step": 735 }, { "epoch": 0.18, "learning_rate": 0.00020895124346685017, "loss": 3.1309, "step": 740 }, { "epoch": 0.18, "learning_rate": 0.00020872360760284219, "loss": 3.1125, "step": 745 }, { "epoch": 0.18, "learning_rate": 0.0002084944945067849, "loss": 3.1108, "step": 750 }, { "epoch": 0.19, "learning_rate": 0.00020826390769603968, "loss": 3.0765, "step": 755 }, { "epoch": 0.19, "learning_rate": 0.00020803185071059267, "loss": 3.0634, "step": 760 }, { "epoch": 0.19, "learning_rate": 0.000207798327113, "loss": 3.0993, "step": 765 }, { "epoch": 0.19, "learning_rate": 0.0002075633404883336, "loss": 3.1127, "step": 770 }, { "epoch": 0.19, "learning_rate": 0.00020732689444412573, "loss": 3.0502, "step": 775 }, { "epoch": 0.19, "learning_rate": 0.0002070889926103138, "loss": 3.0436, "step": 780 }, { "epoch": 0.19, "learning_rate": 0.0002068496386391846, "loss": 3.0305, "step": 785 }, { "epoch": 0.19, "learning_rate": 0.0002066088362053184, "loss": 3.0996, "step": 790 }, { "epoch": 0.2, "learning_rate": 0.00020636658900553213, "loss": 3.0584, "step": 795 }, { "epoch": 0.2, "learning_rate": 0.00020612290075882296, "loss": 3.0508, "step": 800 }, { "epoch": 0.2, "learning_rate": 0.00020587777520631126, "loss": 3.0578, "step": 805 }, { "epoch": 0.2, "learning_rate": 0.00020563121611118286, "loss": 3.0308, "step": 810 }, { "epoch": 0.2, "eval_loss": 3.0524611473083496, "eval_runtime": 955.714, "eval_samples_per_second": 79.138, "eval_steps_per_second": 4.397, "step": 812 }, { "epoch": 0.2, "learning_rate": 0.00020538322725863146, "loss": 3.122, "step": 815 }, { "epoch": 0.2, "learning_rate": 0.00020513381245580064, "loss": 2.9886, "step": 820 }, { "epoch": 0.2, "learning_rate": 0.00020488297553172515, "loss": 3.0606, "step": 825 }, { "epoch": 0.2, "learning_rate": 0.00020463072033727225, "loss": 2.993, "step": 830 }, { "epoch": 0.21, "learning_rate": 0.00020437705074508264, "loss": 2.9999, "step": 835 }, { "epoch": 0.21, "learning_rate": 0.00020412197064951097, "loss": 3.0143, "step": 840 }, { "epoch": 0.21, "learning_rate": 0.000203865483966566, "loss": 2.9886, "step": 845 }, { "epoch": 0.21, "learning_rate": 0.00020360759463385053, "loss": 3.0219, "step": 850 }, { "epoch": 0.21, "learning_rate": 0.00020334830661050102, "loss": 2.9888, "step": 855 }, { "epoch": 0.21, "learning_rate": 0.00020308762387712662, "loss": 3.0271, "step": 860 }, { "epoch": 0.21, "learning_rate": 0.00020282555043574823, "loss": 3.0063, "step": 865 }, { "epoch": 0.21, "learning_rate": 0.00020256209030973708, "loss": 3.0198, "step": 870 }, { "epoch": 0.22, "learning_rate": 0.00020229724754375266, "loss": 3.0135, "step": 875 }, { "epoch": 0.22, "learning_rate": 0.00020203102620368113, "loss": 3.0008, "step": 880 }, { "epoch": 0.22, "learning_rate": 0.00020176343037657242, "loss": 3.0168, "step": 885 }, { "epoch": 0.22, "learning_rate": 0.00020149446417057782, "loss": 3.042, "step": 890 }, { "epoch": 0.22, "learning_rate": 0.00020122413171488667, "loss": 2.9954, "step": 895 }, { "epoch": 0.22, "learning_rate": 0.00020095243715966316, "loss": 3.0191, "step": 900 }, { "epoch": 0.22, "learning_rate": 0.0002006793846759825, "loss": 3.0343, "step": 905 }, { "epoch": 0.22, "learning_rate": 0.0002004049784557669, "loss": 2.9843, "step": 910 }, { "epoch": 0.23, "learning_rate": 0.00020012922271172128, "loss": 3.0263, "step": 915 }, { "epoch": 0.23, "learning_rate": 0.00019985212167726853, "loss": 2.975, "step": 920 }, { "epoch": 0.23, "learning_rate": 0.0001995736796064845, "loss": 2.9858, "step": 925 }, { "epoch": 0.23, "learning_rate": 0.0001992939007740328, "loss": 2.9526, "step": 930 }, { "epoch": 0.23, "learning_rate": 0.0001990127894750991, "loss": 2.971, "step": 935 }, { "epoch": 0.23, "learning_rate": 0.00019873035002532512, "loss": 2.9635, "step": 940 }, { "epoch": 0.23, "learning_rate": 0.00019844658676074255, "loss": 2.9807, "step": 945 }, { "epoch": 0.23, "learning_rate": 0.0001981615040377063, "loss": 2.9822, "step": 950 }, { "epoch": 0.24, "learning_rate": 0.00019787510623282776, "loss": 2.9552, "step": 955 }, { "epoch": 0.24, "learning_rate": 0.00019758739774290753, "loss": 2.9877, "step": 960 }, { "epoch": 0.24, "learning_rate": 0.00019729838298486793, "loss": 2.974, "step": 965 }, { "epoch": 0.24, "learning_rate": 0.00019700806639568524, "loss": 2.9613, "step": 970 }, { "epoch": 0.24, "learning_rate": 0.00019671645243232155, "loss": 2.9949, "step": 975 }, { "epoch": 0.24, "learning_rate": 0.00019642354557165633, "loss": 2.9876, "step": 980 }, { "epoch": 0.24, "learning_rate": 0.00019612935031041768, "loss": 2.9644, "step": 985 }, { "epoch": 0.24, "learning_rate": 0.00019583387116511335, "loss": 2.9204, "step": 990 }, { "epoch": 0.25, "learning_rate": 0.00019553711267196136, "loss": 2.9849, "step": 995 }, { "epoch": 0.25, "learning_rate": 0.00019523907938682038, "loss": 2.9622, "step": 1000 }, { "epoch": 0.25, "learning_rate": 0.00019493977588511978, "loss": 2.9457, "step": 1005 }, { "epoch": 0.25, "learning_rate": 0.0001946392067617894, "loss": 2.9558, "step": 1010 }, { "epoch": 0.25, "learning_rate": 0.00019433737663118898, "loss": 2.9107, "step": 1015 }, { "epoch": 0.25, "learning_rate": 0.0001940342901270374, "loss": 2.963, "step": 1020 }, { "epoch": 0.25, "learning_rate": 0.00019372995190234136, "loss": 2.8945, "step": 1025 }, { "epoch": 0.25, "learning_rate": 0.00019342436662932416, "loss": 2.9756, "step": 1030 }, { "epoch": 0.25, "learning_rate": 0.00019311753899935389, "loss": 2.9392, "step": 1035 }, { "epoch": 0.26, "learning_rate": 0.00019280947372287132, "loss": 2.9293, "step": 1040 }, { "epoch": 0.26, "learning_rate": 0.00019250017552931774, "loss": 2.947, "step": 1045 }, { "epoch": 0.26, "learning_rate": 0.00019218964916706223, "loss": 2.9317, "step": 1050 }, { "epoch": 0.26, "learning_rate": 0.00019187789940332882, "loss": 2.8816, "step": 1055 }, { "epoch": 0.26, "learning_rate": 0.0001915649310241233, "loss": 2.9644, "step": 1060 }, { "epoch": 0.26, "learning_rate": 0.0001912507488341597, "loss": 2.9219, "step": 1065 }, { "epoch": 0.26, "learning_rate": 0.00019093535765678663, "loss": 2.8983, "step": 1070 }, { "epoch": 0.26, "learning_rate": 0.00019061876233391313, "loss": 2.8816, "step": 1075 }, { "epoch": 0.27, "learning_rate": 0.00019030096772593432, "loss": 2.8844, "step": 1080 }, { "epoch": 0.27, "learning_rate": 0.00018998197871165692, "loss": 2.9207, "step": 1085 }, { "epoch": 0.27, "learning_rate": 0.00018966180018822423, "loss": 2.9553, "step": 1090 }, { "epoch": 0.27, "learning_rate": 0.00018934043707104098, "loss": 2.893, "step": 1095 }, { "epoch": 0.27, "learning_rate": 0.0001890178942936979, "loss": 2.9049, "step": 1100 }, { "epoch": 0.27, "learning_rate": 0.00018869417680789587, "loss": 2.8872, "step": 1105 }, { "epoch": 0.27, "learning_rate": 0.00018836928958337009, "loss": 2.9219, "step": 1110 }, { "epoch": 0.27, "learning_rate": 0.00018804323760781362, "loss": 2.9005, "step": 1115 }, { "epoch": 0.28, "learning_rate": 0.00018771602588680083, "loss": 2.8814, "step": 1120 }, { "epoch": 0.28, "learning_rate": 0.00018738765944371067, "loss": 2.8915, "step": 1125 }, { "epoch": 0.28, "learning_rate": 0.00018705814331964945, "loss": 2.8777, "step": 1130 }, { "epoch": 0.28, "learning_rate": 0.0001867274825733734, "loss": 2.8821, "step": 1135 }, { "epoch": 0.28, "learning_rate": 0.0001863956822812112, "loss": 2.8803, "step": 1140 }, { "epoch": 0.28, "learning_rate": 0.00018606274753698576, "loss": 2.8787, "step": 1145 }, { "epoch": 0.28, "learning_rate": 0.00018572868345193632, "loss": 2.9365, "step": 1150 }, { "epoch": 0.28, "learning_rate": 0.0001853934951546398, "loss": 2.8642, "step": 1155 }, { "epoch": 0.29, "learning_rate": 0.00018505718779093206, "loss": 2.8333, "step": 1160 }, { "epoch": 0.29, "learning_rate": 0.0001847197665238291, "loss": 2.9019, "step": 1165 }, { "epoch": 0.29, "learning_rate": 0.00018438123653344746, "loss": 2.8726, "step": 1170 }, { "epoch": 0.29, "learning_rate": 0.00018404160301692504, "loss": 2.8655, "step": 1175 }, { "epoch": 0.29, "learning_rate": 0.00018370087118834102, "loss": 2.8639, "step": 1180 }, { "epoch": 0.29, "learning_rate": 0.00018335904627863605, "loss": 2.8742, "step": 1185 }, { "epoch": 0.29, "learning_rate": 0.00018301613353553182, "loss": 2.9013, "step": 1190 }, { "epoch": 0.29, "learning_rate": 0.0001826721382234505, "loss": 2.9131, "step": 1195 }, { "epoch": 0.3, "learning_rate": 0.000182327065623434, "loss": 2.9068, "step": 1200 }, { "epoch": 0.3, "learning_rate": 0.0001819809210330627, "loss": 2.9222, "step": 1205 }, { "epoch": 0.3, "learning_rate": 0.0001816337097663744, "loss": 2.8718, "step": 1210 }, { "epoch": 0.3, "learning_rate": 0.00018128543715378252, "loss": 2.8504, "step": 1215 }, { "epoch": 0.3, "eval_loss": 2.8658618927001953, "eval_runtime": 957.687, "eval_samples_per_second": 78.975, "eval_steps_per_second": 4.388, "step": 1218 }, { "epoch": 0.3, "learning_rate": 0.00018093610854199438, "loss": 2.8823, "step": 1220 }, { "epoch": 0.3, "learning_rate": 0.00018058572929392902, "loss": 2.8528, "step": 1225 }, { "epoch": 0.3, "learning_rate": 0.000180234304788635, "loss": 2.9415, "step": 1230 }, { "epoch": 0.3, "learning_rate": 0.0001798818404212077, "loss": 2.8599, "step": 1235 }, { "epoch": 0.31, "learning_rate": 0.00017952834160270655, "loss": 2.8218, "step": 1240 }, { "epoch": 0.31, "learning_rate": 0.000179173813760072, "loss": 2.8258, "step": 1245 }, { "epoch": 0.31, "learning_rate": 0.00017881826233604204, "loss": 2.8397, "step": 1250 }, { "epoch": 0.31, "learning_rate": 0.00017846169278906888, "loss": 2.8817, "step": 1255 }, { "epoch": 0.31, "learning_rate": 0.00017810411059323498, "loss": 2.8424, "step": 1260 }, { "epoch": 0.31, "learning_rate": 0.00017774552123816904, "loss": 2.845, "step": 1265 }, { "epoch": 0.31, "learning_rate": 0.00017738593022896177, "loss": 2.8272, "step": 1270 }, { "epoch": 0.31, "learning_rate": 0.00017702534308608133, "loss": 2.8452, "step": 1275 }, { "epoch": 0.32, "learning_rate": 0.00017666376534528866, "loss": 2.8805, "step": 1280 }, { "epoch": 0.32, "learning_rate": 0.00017630120255755235, "loss": 2.8824, "step": 1285 }, { "epoch": 0.32, "learning_rate": 0.00017593766028896357, "loss": 2.8669, "step": 1290 }, { "epoch": 0.32, "learning_rate": 0.0001755731441206505, "loss": 2.8883, "step": 1295 }, { "epoch": 0.32, "learning_rate": 0.0001752076596486927, "loss": 2.813, "step": 1300 }, { "epoch": 0.32, "learning_rate": 0.0001748412124840353, "loss": 2.8275, "step": 1305 }, { "epoch": 0.32, "learning_rate": 0.00017447380825240264, "loss": 2.8017, "step": 1310 }, { "epoch": 0.32, "learning_rate": 0.00017410545259421208, "loss": 2.8368, "step": 1315 }, { "epoch": 0.33, "learning_rate": 0.00017373615116448736, "loss": 2.8448, "step": 1320 }, { "epoch": 0.33, "learning_rate": 0.00017336590963277173, "loss": 2.8353, "step": 1325 }, { "epoch": 0.33, "learning_rate": 0.00017299473368304102, "loss": 2.8326, "step": 1330 }, { "epoch": 0.33, "learning_rate": 0.00017262262901361627, "loss": 2.7981, "step": 1335 }, { "epoch": 0.33, "learning_rate": 0.00017224960133707627, "loss": 2.828, "step": 1340 }, { "epoch": 0.33, "learning_rate": 0.00017187565638017, "loss": 2.8209, "step": 1345 }, { "epoch": 0.33, "learning_rate": 0.00017150079988372842, "loss": 2.8166, "step": 1350 }, { "epoch": 0.33, "learning_rate": 0.0001711250376025767, "loss": 2.7783, "step": 1355 }, { "epoch": 0.34, "learning_rate": 0.00017074837530544557, "loss": 2.7564, "step": 1360 }, { "epoch": 0.34, "learning_rate": 0.00017037081877488284, "loss": 2.7957, "step": 1365 }, { "epoch": 0.34, "learning_rate": 0.0001699923738071648, "loss": 2.8139, "step": 1370 }, { "epoch": 0.34, "learning_rate": 0.00016961304621220696, "loss": 2.7938, "step": 1375 }, { "epoch": 0.34, "learning_rate": 0.00016923284181347506, "loss": 2.8097, "step": 1380 }, { "epoch": 0.34, "learning_rate": 0.00016885176644789557, "loss": 2.8043, "step": 1385 }, { "epoch": 0.34, "learning_rate": 0.00016846982596576614, "loss": 2.7577, "step": 1390 }, { "epoch": 0.34, "learning_rate": 0.0001680870262306657, "loss": 2.7921, "step": 1395 }, { "epoch": 0.34, "learning_rate": 0.00016770337311936456, "loss": 2.7836, "step": 1400 }, { "epoch": 0.35, "learning_rate": 0.00016731887252173408, "loss": 2.7963, "step": 1405 }, { "epoch": 0.35, "learning_rate": 0.0001669335303406563, "loss": 2.8547, "step": 1410 }, { "epoch": 0.35, "learning_rate": 0.00016654735249193334, "loss": 2.808, "step": 1415 }, { "epoch": 0.35, "learning_rate": 0.00016616034490419648, "loss": 2.7782, "step": 1420 }, { "epoch": 0.35, "learning_rate": 0.00016577251351881532, "loss": 2.76, "step": 1425 }, { "epoch": 0.35, "learning_rate": 0.00016538386428980638, "loss": 2.7957, "step": 1430 }, { "epoch": 0.35, "learning_rate": 0.0001649944031837418, "loss": 2.819, "step": 1435 }, { "epoch": 0.35, "learning_rate": 0.0001646041361796578, "loss": 2.7574, "step": 1440 }, { "epoch": 0.36, "learning_rate": 0.00016421306926896266, "loss": 2.7939, "step": 1445 }, { "epoch": 0.36, "learning_rate": 0.00016382120845534497, "loss": 2.7814, "step": 1450 }, { "epoch": 0.36, "learning_rate": 0.00016342855975468135, "loss": 2.7271, "step": 1455 }, { "epoch": 0.36, "learning_rate": 0.0001630351291949442, "loss": 2.8333, "step": 1460 }, { "epoch": 0.36, "learning_rate": 0.0001626409228161089, "loss": 2.8128, "step": 1465 }, { "epoch": 0.36, "learning_rate": 0.0001622459466700615, "loss": 2.7752, "step": 1470 }, { "epoch": 0.36, "learning_rate": 0.00016185020682050541, "loss": 2.8474, "step": 1475 }, { "epoch": 0.36, "learning_rate": 0.0001614537093428685, "loss": 2.8078, "step": 1480 }, { "epoch": 0.37, "learning_rate": 0.00016105646032420982, "loss": 2.7696, "step": 1485 }, { "epoch": 0.37, "learning_rate": 0.00016065846586312617, "loss": 2.8652, "step": 1490 }, { "epoch": 0.37, "learning_rate": 0.0001602597320696584, "loss": 2.7888, "step": 1495 }, { "epoch": 0.37, "learning_rate": 0.00015986026506519755, "loss": 2.7349, "step": 1500 } ], "logging_steps": 5, "max_steps": 4059, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 1.8026854820610048e+19, "train_batch_size": 18, "trial_name": null, "trial_params": null }