{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8837229214830038, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.5656424760818481, "learning_rate": 0.00019972583961617548, "loss": 2.6115, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.9568225145339966, "learning_rate": 0.0001993831391363948, "loss": 2.3674, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.8508038520812988, "learning_rate": 0.00019904043865661412, "loss": 2.1487, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.0186710357666016, "learning_rate": 0.00019869773817683345, "loss": 2.1421, "step": 40 }, { "epoch": 0.02, "grad_norm": 1.7234398126602173, "learning_rate": 0.0001983550376970528, "loss": 2.0021, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.197520136833191, "learning_rate": 0.0001980123372172721, "loss": 2.0381, "step": 60 }, { "epoch": 0.02, "grad_norm": 1.0739991664886475, "learning_rate": 0.00019766963673749144, "loss": 1.8883, "step": 70 }, { "epoch": 0.03, "grad_norm": 1.0150132179260254, "learning_rate": 0.00019732693625771076, "loss": 1.8494, "step": 80 }, { "epoch": 0.03, "grad_norm": 1.236234426498413, "learning_rate": 0.0001969842357779301, "loss": 1.9171, "step": 90 }, { "epoch": 0.03, "grad_norm": 1.0886958837509155, "learning_rate": 0.00019664153529814942, "loss": 1.9387, "step": 100 }, { "epoch": 0.04, "grad_norm": 1.1191097497940063, "learning_rate": 0.00019629883481836875, "loss": 1.976, "step": 110 }, { "epoch": 0.04, "grad_norm": 1.0738675594329834, "learning_rate": 0.00019595613433858808, "loss": 1.8378, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.648668646812439, "learning_rate": 0.0001956134338588074, "loss": 1.768, "step": 130 }, { "epoch": 0.05, "grad_norm": 0.9386289119720459, "learning_rate": 0.00019527073337902674, "loss": 2.0067, "step": 140 }, { "epoch": 0.05, "grad_norm": 1.1613832712173462, "learning_rate": 0.00019492803289924607, "loss": 1.9355, "step": 150 }, { "epoch": 0.05, "grad_norm": 0.7319044470787048, "learning_rate": 0.0001945853324194654, "loss": 1.9042, "step": 160 }, { "epoch": 0.06, "grad_norm": 0.9041644930839539, "learning_rate": 0.00019424263193968473, "loss": 1.7163, "step": 170 }, { "epoch": 0.06, "grad_norm": 0.9293299317359924, "learning_rate": 0.00019389993145990406, "loss": 1.807, "step": 180 }, { "epoch": 0.07, "grad_norm": 0.9214122295379639, "learning_rate": 0.00019355723098012336, "loss": 1.9056, "step": 190 }, { "epoch": 0.07, "grad_norm": 0.7177646160125732, "learning_rate": 0.0001932145305003427, "loss": 1.9574, "step": 200 }, { "epoch": 0.07, "grad_norm": 0.813965916633606, "learning_rate": 0.00019287183002056205, "loss": 1.7995, "step": 210 }, { "epoch": 0.08, "grad_norm": 1.0333760976791382, "learning_rate": 0.00019252912954078138, "loss": 1.814, "step": 220 }, { "epoch": 0.08, "grad_norm": 0.6691217422485352, "learning_rate": 0.0001921864290610007, "loss": 1.8261, "step": 230 }, { "epoch": 0.08, "grad_norm": 1.1737751960754395, "learning_rate": 0.00019184372858122, "loss": 1.9473, "step": 240 }, { "epoch": 0.09, "grad_norm": 1.1508344411849976, "learning_rate": 0.00019150102810143934, "loss": 1.9176, "step": 250 }, { "epoch": 0.09, "grad_norm": 0.6660133600234985, "learning_rate": 0.00019115832762165867, "loss": 1.835, "step": 260 }, { "epoch": 0.09, "grad_norm": 0.6423531174659729, "learning_rate": 0.00019081562714187803, "loss": 1.7194, "step": 270 }, { "epoch": 0.1, "grad_norm": 0.8241636157035828, "learning_rate": 0.00019047292666209733, "loss": 1.8679, "step": 280 }, { "epoch": 0.1, "grad_norm": 0.7184795141220093, "learning_rate": 0.00019013022618231666, "loss": 1.8129, "step": 290 }, { "epoch": 0.1, "grad_norm": 0.8253782391548157, "learning_rate": 0.000189787525702536, "loss": 1.8567, "step": 300 }, { "epoch": 0.11, "grad_norm": 1.417243242263794, "learning_rate": 0.00018944482522275532, "loss": 1.7741, "step": 310 }, { "epoch": 0.11, "grad_norm": 0.9040454626083374, "learning_rate": 0.00018910212474297465, "loss": 1.8458, "step": 320 }, { "epoch": 0.11, "grad_norm": 0.6580069065093994, "learning_rate": 0.00018875942426319398, "loss": 1.7982, "step": 330 }, { "epoch": 0.12, "grad_norm": 0.8849833011627197, "learning_rate": 0.0001884167237834133, "loss": 1.8622, "step": 340 }, { "epoch": 0.12, "grad_norm": 1.0523239374160767, "learning_rate": 0.00018807402330363264, "loss": 1.8608, "step": 350 }, { "epoch": 0.12, "grad_norm": 1.0496423244476318, "learning_rate": 0.00018773132282385194, "loss": 1.8245, "step": 360 }, { "epoch": 0.13, "grad_norm": 0.9488272070884705, "learning_rate": 0.00018738862234407127, "loss": 1.8933, "step": 370 }, { "epoch": 0.13, "grad_norm": 0.9461072087287903, "learning_rate": 0.00018704592186429063, "loss": 1.7277, "step": 380 }, { "epoch": 0.13, "grad_norm": 0.6415026187896729, "learning_rate": 0.00018670322138450996, "loss": 1.7843, "step": 390 }, { "epoch": 0.14, "grad_norm": 1.0457078218460083, "learning_rate": 0.0001863605209047293, "loss": 1.8874, "step": 400 }, { "epoch": 0.14, "grad_norm": 1.0890721082687378, "learning_rate": 0.0001860178204249486, "loss": 1.8536, "step": 410 }, { "epoch": 0.14, "grad_norm": 0.8896569013595581, "learning_rate": 0.00018567511994516792, "loss": 1.8297, "step": 420 }, { "epoch": 0.15, "grad_norm": 0.9457584023475647, "learning_rate": 0.00018533241946538728, "loss": 1.8061, "step": 430 }, { "epoch": 0.15, "grad_norm": 0.8208130598068237, "learning_rate": 0.0001849897189856066, "loss": 1.8238, "step": 440 }, { "epoch": 0.15, "grad_norm": 0.7884149551391602, "learning_rate": 0.0001846470185058259, "loss": 1.7419, "step": 450 }, { "epoch": 0.16, "grad_norm": 1.5733205080032349, "learning_rate": 0.00018430431802604524, "loss": 1.8829, "step": 460 }, { "epoch": 0.16, "grad_norm": 0.963455319404602, "learning_rate": 0.00018396161754626457, "loss": 1.822, "step": 470 }, { "epoch": 0.16, "grad_norm": 0.616909384727478, "learning_rate": 0.0001836189170664839, "loss": 1.7923, "step": 480 }, { "epoch": 0.17, "grad_norm": 0.5382218360900879, "learning_rate": 0.00018327621658670323, "loss": 1.719, "step": 490 }, { "epoch": 0.17, "grad_norm": 1.171004056930542, "learning_rate": 0.00018293351610692256, "loss": 1.8522, "step": 500 }, { "epoch": 0.17, "eval_loss": 1.9394277334213257, "eval_runtime": 33.4276, "eval_samples_per_second": 29.915, "eval_steps_per_second": 3.739, "step": 500 }, { "epoch": 0.17, "grad_norm": 0.7731293439865112, "learning_rate": 0.0001825908156271419, "loss": 1.9151, "step": 510 }, { "epoch": 0.18, "grad_norm": 0.8664043545722961, "learning_rate": 0.00018224811514736122, "loss": 1.6679, "step": 520 }, { "epoch": 0.18, "grad_norm": 1.3886076211929321, "learning_rate": 0.00018190541466758055, "loss": 1.8509, "step": 530 }, { "epoch": 0.18, "grad_norm": 0.7000617384910583, "learning_rate": 0.00018156271418779988, "loss": 1.8046, "step": 540 }, { "epoch": 0.19, "grad_norm": 0.8490706086158752, "learning_rate": 0.0001812200137080192, "loss": 1.748, "step": 550 }, { "epoch": 0.19, "grad_norm": 1.4293190240859985, "learning_rate": 0.00018087731322823854, "loss": 1.9725, "step": 560 }, { "epoch": 0.2, "grad_norm": 0.7126957178115845, "learning_rate": 0.00018053461274845787, "loss": 1.6888, "step": 570 }, { "epoch": 0.2, "grad_norm": 0.9974524974822998, "learning_rate": 0.00018019191226867717, "loss": 1.8405, "step": 580 }, { "epoch": 0.2, "grad_norm": 0.9911081790924072, "learning_rate": 0.0001798492117888965, "loss": 1.7753, "step": 590 }, { "epoch": 0.21, "grad_norm": 1.3659840822219849, "learning_rate": 0.00017950651130911585, "loss": 1.7435, "step": 600 }, { "epoch": 0.21, "grad_norm": 0.4976978302001953, "learning_rate": 0.00017916381082933518, "loss": 1.759, "step": 610 }, { "epoch": 0.21, "grad_norm": 0.7868736982345581, "learning_rate": 0.0001788211103495545, "loss": 1.7654, "step": 620 }, { "epoch": 0.22, "grad_norm": 1.006628155708313, "learning_rate": 0.00017847840986977382, "loss": 1.7862, "step": 630 }, { "epoch": 0.22, "grad_norm": 0.8664697408676147, "learning_rate": 0.00017813570938999315, "loss": 1.8815, "step": 640 }, { "epoch": 0.22, "grad_norm": 0.44789645075798035, "learning_rate": 0.00017779300891021248, "loss": 1.779, "step": 650 }, { "epoch": 0.23, "grad_norm": 0.9740760326385498, "learning_rate": 0.00017745030843043183, "loss": 1.7026, "step": 660 }, { "epoch": 0.23, "grad_norm": 0.9802984595298767, "learning_rate": 0.00017710760795065114, "loss": 1.8359, "step": 670 }, { "epoch": 0.23, "grad_norm": 1.0521053075790405, "learning_rate": 0.00017676490747087047, "loss": 1.7777, "step": 680 }, { "epoch": 0.24, "grad_norm": 0.6399825215339661, "learning_rate": 0.0001764222069910898, "loss": 1.8129, "step": 690 }, { "epoch": 0.24, "grad_norm": 1.1847810745239258, "learning_rate": 0.00017607950651130912, "loss": 1.8775, "step": 700 }, { "epoch": 0.24, "grad_norm": 0.7050787806510925, "learning_rate": 0.00017573680603152845, "loss": 1.8454, "step": 710 }, { "epoch": 0.25, "grad_norm": 0.8241177797317505, "learning_rate": 0.00017539410555174778, "loss": 1.7047, "step": 720 }, { "epoch": 0.25, "grad_norm": 1.743680477142334, "learning_rate": 0.00017505140507196711, "loss": 1.8251, "step": 730 }, { "epoch": 0.25, "grad_norm": 0.776196300983429, "learning_rate": 0.00017470870459218644, "loss": 1.8341, "step": 740 }, { "epoch": 0.26, "grad_norm": 0.6896054744720459, "learning_rate": 0.00017436600411240575, "loss": 1.7569, "step": 750 }, { "epoch": 0.26, "grad_norm": 0.703697919845581, "learning_rate": 0.0001740233036326251, "loss": 1.7696, "step": 760 }, { "epoch": 0.26, "grad_norm": 0.6734452247619629, "learning_rate": 0.00017368060315284443, "loss": 1.6639, "step": 770 }, { "epoch": 0.27, "grad_norm": 0.6856238842010498, "learning_rate": 0.00017333790267306376, "loss": 1.8419, "step": 780 }, { "epoch": 0.27, "grad_norm": 1.1194758415222168, "learning_rate": 0.00017299520219328306, "loss": 1.7916, "step": 790 }, { "epoch": 0.27, "grad_norm": 1.455841064453125, "learning_rate": 0.0001726525017135024, "loss": 1.7368, "step": 800 }, { "epoch": 0.28, "grad_norm": 0.5988683700561523, "learning_rate": 0.00017230980123372172, "loss": 1.8434, "step": 810 }, { "epoch": 0.28, "grad_norm": 0.9031710028648376, "learning_rate": 0.00017196710075394108, "loss": 1.7447, "step": 820 }, { "epoch": 0.28, "grad_norm": 1.2125264406204224, "learning_rate": 0.0001716244002741604, "loss": 1.9449, "step": 830 }, { "epoch": 0.29, "grad_norm": 0.9563066959381104, "learning_rate": 0.0001712816997943797, "loss": 1.7063, "step": 840 }, { "epoch": 0.29, "grad_norm": 0.8778769969940186, "learning_rate": 0.00017093899931459904, "loss": 1.802, "step": 850 }, { "epoch": 0.29, "grad_norm": 1.0570799112319946, "learning_rate": 0.00017059629883481837, "loss": 1.7331, "step": 860 }, { "epoch": 0.3, "grad_norm": 0.8234407305717468, "learning_rate": 0.0001702535983550377, "loss": 1.7943, "step": 870 }, { "epoch": 0.3, "grad_norm": 0.968658983707428, "learning_rate": 0.00016991089787525703, "loss": 1.8527, "step": 880 }, { "epoch": 0.3, "grad_norm": 0.6607180237770081, "learning_rate": 0.00016956819739547636, "loss": 1.8521, "step": 890 }, { "epoch": 0.31, "grad_norm": 0.8055354952812195, "learning_rate": 0.0001692254969156957, "loss": 1.6901, "step": 900 }, { "epoch": 0.31, "grad_norm": 0.8606925010681152, "learning_rate": 0.00016888279643591502, "loss": 1.7248, "step": 910 }, { "epoch": 0.32, "grad_norm": 0.9894892573356628, "learning_rate": 0.00016854009595613432, "loss": 1.7541, "step": 920 }, { "epoch": 0.32, "grad_norm": 0.8559629321098328, "learning_rate": 0.00016819739547635368, "loss": 1.7803, "step": 930 }, { "epoch": 0.32, "grad_norm": 0.8917673826217651, "learning_rate": 0.000167854694996573, "loss": 1.8224, "step": 940 }, { "epoch": 0.33, "grad_norm": 1.2621186971664429, "learning_rate": 0.00016751199451679234, "loss": 1.8253, "step": 950 }, { "epoch": 0.33, "grad_norm": 1.1135177612304688, "learning_rate": 0.00016716929403701167, "loss": 1.6519, "step": 960 }, { "epoch": 0.33, "grad_norm": 0.7034028172492981, "learning_rate": 0.00016682659355723097, "loss": 1.7079, "step": 970 }, { "epoch": 0.34, "grad_norm": 0.7942814826965332, "learning_rate": 0.0001664838930774503, "loss": 1.828, "step": 980 }, { "epoch": 0.34, "grad_norm": 0.9687950611114502, "learning_rate": 0.00016614119259766966, "loss": 1.7203, "step": 990 }, { "epoch": 0.34, "grad_norm": 1.1074302196502686, "learning_rate": 0.000165798492117889, "loss": 1.7146, "step": 1000 }, { "epoch": 0.34, "eval_loss": 1.9078810214996338, "eval_runtime": 33.2486, "eval_samples_per_second": 30.076, "eval_steps_per_second": 3.76, "step": 1000 }, { "epoch": 0.35, "grad_norm": 0.9533829689025879, "learning_rate": 0.0001654557916381083, "loss": 1.7596, "step": 1010 }, { "epoch": 0.35, "grad_norm": 1.0547090768814087, "learning_rate": 0.00016511309115832762, "loss": 1.9113, "step": 1020 }, { "epoch": 0.35, "grad_norm": 1.0186220407485962, "learning_rate": 0.00016477039067854695, "loss": 1.7845, "step": 1030 }, { "epoch": 0.36, "grad_norm": 0.9044001698493958, "learning_rate": 0.00016442769019876628, "loss": 1.8174, "step": 1040 }, { "epoch": 0.36, "grad_norm": 0.6433171033859253, "learning_rate": 0.0001640849897189856, "loss": 1.7702, "step": 1050 }, { "epoch": 0.36, "grad_norm": 1.2511520385742188, "learning_rate": 0.00016374228923920494, "loss": 1.9304, "step": 1060 }, { "epoch": 0.37, "grad_norm": 0.7901211977005005, "learning_rate": 0.00016339958875942427, "loss": 1.8432, "step": 1070 }, { "epoch": 0.37, "grad_norm": 1.515535831451416, "learning_rate": 0.0001630568882796436, "loss": 1.8818, "step": 1080 }, { "epoch": 0.37, "grad_norm": 0.9449120759963989, "learning_rate": 0.00016271418779986293, "loss": 1.8594, "step": 1090 }, { "epoch": 0.38, "grad_norm": 0.7776308059692383, "learning_rate": 0.00016237148732008226, "loss": 1.8896, "step": 1100 }, { "epoch": 0.38, "grad_norm": 1.3541969060897827, "learning_rate": 0.0001620287868403016, "loss": 1.8208, "step": 1110 }, { "epoch": 0.38, "grad_norm": 0.7614444494247437, "learning_rate": 0.00016168608636052092, "loss": 1.759, "step": 1120 }, { "epoch": 0.39, "grad_norm": 1.170345425605774, "learning_rate": 0.00016134338588074025, "loss": 1.6713, "step": 1130 }, { "epoch": 0.39, "grad_norm": 0.8094021081924438, "learning_rate": 0.00016100068540095955, "loss": 1.7394, "step": 1140 }, { "epoch": 0.39, "grad_norm": 1.169124722480774, "learning_rate": 0.0001606579849211789, "loss": 1.7609, "step": 1150 }, { "epoch": 0.4, "grad_norm": 0.6766496300697327, "learning_rate": 0.00016031528444139824, "loss": 1.7812, "step": 1160 }, { "epoch": 0.4, "grad_norm": 1.0808138847351074, "learning_rate": 0.00015997258396161757, "loss": 1.7777, "step": 1170 }, { "epoch": 0.4, "grad_norm": 0.6450923681259155, "learning_rate": 0.00015962988348183687, "loss": 1.8539, "step": 1180 }, { "epoch": 0.41, "grad_norm": 1.0518946647644043, "learning_rate": 0.0001592871830020562, "loss": 1.7799, "step": 1190 }, { "epoch": 0.41, "grad_norm": 0.7807414531707764, "learning_rate": 0.00015894448252227553, "loss": 1.774, "step": 1200 }, { "epoch": 0.41, "grad_norm": 1.4259986877441406, "learning_rate": 0.00015860178204249488, "loss": 1.8153, "step": 1210 }, { "epoch": 0.42, "grad_norm": 0.9342586994171143, "learning_rate": 0.0001582590815627142, "loss": 1.7495, "step": 1220 }, { "epoch": 0.42, "grad_norm": 0.7621099948883057, "learning_rate": 0.00015791638108293352, "loss": 1.7964, "step": 1230 }, { "epoch": 0.42, "grad_norm": 0.8253260254859924, "learning_rate": 0.00015757368060315285, "loss": 1.7669, "step": 1240 }, { "epoch": 0.43, "grad_norm": 0.6914420127868652, "learning_rate": 0.00015723098012337218, "loss": 1.803, "step": 1250 }, { "epoch": 0.43, "grad_norm": 0.7147281765937805, "learning_rate": 0.0001568882796435915, "loss": 1.8226, "step": 1260 }, { "epoch": 0.43, "grad_norm": 2.0851213932037354, "learning_rate": 0.00015654557916381084, "loss": 1.6957, "step": 1270 }, { "epoch": 0.44, "grad_norm": 0.6254770159721375, "learning_rate": 0.00015620287868403017, "loss": 1.75, "step": 1280 }, { "epoch": 0.44, "grad_norm": 1.0984652042388916, "learning_rate": 0.0001558601782042495, "loss": 1.8425, "step": 1290 }, { "epoch": 0.45, "grad_norm": 1.0353467464447021, "learning_rate": 0.00015551747772446882, "loss": 1.7995, "step": 1300 }, { "epoch": 0.45, "grad_norm": 0.6647160053253174, "learning_rate": 0.00015517477724468813, "loss": 1.866, "step": 1310 }, { "epoch": 0.45, "grad_norm": 0.6671775579452515, "learning_rate": 0.00015483207676490748, "loss": 1.6871, "step": 1320 }, { "epoch": 0.46, "grad_norm": 1.0024131536483765, "learning_rate": 0.00015448937628512681, "loss": 1.7424, "step": 1330 }, { "epoch": 0.46, "grad_norm": 1.0090551376342773, "learning_rate": 0.00015414667580534614, "loss": 1.7001, "step": 1340 }, { "epoch": 0.46, "grad_norm": 0.9725455045700073, "learning_rate": 0.00015380397532556545, "loss": 1.7114, "step": 1350 }, { "epoch": 0.47, "grad_norm": 0.6556392312049866, "learning_rate": 0.00015346127484578478, "loss": 1.5969, "step": 1360 }, { "epoch": 0.47, "grad_norm": 1.156596302986145, "learning_rate": 0.00015311857436600413, "loss": 1.7334, "step": 1370 }, { "epoch": 0.47, "grad_norm": 0.9172496199607849, "learning_rate": 0.00015277587388622346, "loss": 1.7373, "step": 1380 }, { "epoch": 0.48, "grad_norm": 0.9010474681854248, "learning_rate": 0.0001524331734064428, "loss": 1.8032, "step": 1390 }, { "epoch": 0.48, "grad_norm": 0.9486579298973083, "learning_rate": 0.0001520904729266621, "loss": 1.6388, "step": 1400 }, { "epoch": 0.48, "grad_norm": 0.8411978483200073, "learning_rate": 0.00015174777244688142, "loss": 1.7671, "step": 1410 }, { "epoch": 0.49, "grad_norm": 0.9575003385543823, "learning_rate": 0.00015140507196710075, "loss": 1.6523, "step": 1420 }, { "epoch": 0.49, "grad_norm": 0.7651090025901794, "learning_rate": 0.0001510623714873201, "loss": 1.812, "step": 1430 }, { "epoch": 0.49, "grad_norm": 0.8477165699005127, "learning_rate": 0.0001507196710075394, "loss": 1.7125, "step": 1440 }, { "epoch": 0.5, "grad_norm": 0.9737070202827454, "learning_rate": 0.00015037697052775874, "loss": 1.7506, "step": 1450 }, { "epoch": 0.5, "grad_norm": 1.0645496845245361, "learning_rate": 0.00015003427004797807, "loss": 1.7335, "step": 1460 }, { "epoch": 0.5, "grad_norm": 0.9303259253501892, "learning_rate": 0.0001496915695681974, "loss": 1.8838, "step": 1470 }, { "epoch": 0.51, "grad_norm": 0.6571500897407532, "learning_rate": 0.00014934886908841673, "loss": 1.8093, "step": 1480 }, { "epoch": 0.51, "grad_norm": 0.7994106411933899, "learning_rate": 0.00014900616860863606, "loss": 1.6691, "step": 1490 }, { "epoch": 0.51, "grad_norm": 0.8453437685966492, "learning_rate": 0.0001486634681288554, "loss": 1.6731, "step": 1500 }, { "epoch": 0.51, "eval_loss": 1.8940061330795288, "eval_runtime": 33.2126, "eval_samples_per_second": 30.109, "eval_steps_per_second": 3.764, "step": 1500 }, { "epoch": 0.52, "grad_norm": 1.0370814800262451, "learning_rate": 0.00014832076764907472, "loss": 1.7869, "step": 1510 }, { "epoch": 0.52, "grad_norm": 1.0886887311935425, "learning_rate": 0.00014797806716929405, "loss": 1.7887, "step": 1520 }, { "epoch": 0.52, "grad_norm": 0.9058669209480286, "learning_rate": 0.00014763536668951335, "loss": 1.6781, "step": 1530 }, { "epoch": 0.53, "grad_norm": 0.46401920914649963, "learning_rate": 0.0001472926662097327, "loss": 1.6465, "step": 1540 }, { "epoch": 0.53, "grad_norm": 0.6265978813171387, "learning_rate": 0.00014694996572995204, "loss": 1.8399, "step": 1550 }, { "epoch": 0.53, "grad_norm": 0.7882290482521057, "learning_rate": 0.00014660726525017137, "loss": 1.7707, "step": 1560 }, { "epoch": 0.54, "grad_norm": 0.7576068043708801, "learning_rate": 0.00014626456477039067, "loss": 1.8781, "step": 1570 }, { "epoch": 0.54, "grad_norm": 0.8988894820213318, "learning_rate": 0.00014592186429061, "loss": 1.7109, "step": 1580 }, { "epoch": 0.54, "grad_norm": 0.7934654951095581, "learning_rate": 0.00014557916381082933, "loss": 1.8261, "step": 1590 }, { "epoch": 0.55, "grad_norm": 0.9526162147521973, "learning_rate": 0.0001452364633310487, "loss": 1.7286, "step": 1600 }, { "epoch": 0.55, "grad_norm": 0.8650903701782227, "learning_rate": 0.000144893762851268, "loss": 1.8075, "step": 1610 }, { "epoch": 0.55, "grad_norm": 0.8737215399742126, "learning_rate": 0.00014455106237148732, "loss": 1.7683, "step": 1620 }, { "epoch": 0.56, "grad_norm": 1.0927869081497192, "learning_rate": 0.00014420836189170665, "loss": 1.8238, "step": 1630 }, { "epoch": 0.56, "grad_norm": 0.7490981817245483, "learning_rate": 0.00014386566141192598, "loss": 1.7528, "step": 1640 }, { "epoch": 0.57, "grad_norm": 0.6721557974815369, "learning_rate": 0.0001435229609321453, "loss": 1.7212, "step": 1650 }, { "epoch": 0.57, "grad_norm": 0.8125373125076294, "learning_rate": 0.00014318026045236464, "loss": 1.8369, "step": 1660 }, { "epoch": 0.57, "grad_norm": 0.598507821559906, "learning_rate": 0.00014283755997258397, "loss": 1.8455, "step": 1670 }, { "epoch": 0.58, "grad_norm": 1.2567535638809204, "learning_rate": 0.0001424948594928033, "loss": 1.7656, "step": 1680 }, { "epoch": 0.58, "grad_norm": 1.5279853343963623, "learning_rate": 0.00014215215901302263, "loss": 1.8297, "step": 1690 }, { "epoch": 0.58, "grad_norm": 1.1410638093948364, "learning_rate": 0.00014180945853324196, "loss": 1.7489, "step": 1700 }, { "epoch": 0.59, "grad_norm": 0.9007987976074219, "learning_rate": 0.0001414667580534613, "loss": 1.7473, "step": 1710 }, { "epoch": 0.59, "grad_norm": 0.5736974477767944, "learning_rate": 0.00014112405757368062, "loss": 1.8022, "step": 1720 }, { "epoch": 0.59, "grad_norm": 0.6310347318649292, "learning_rate": 0.00014078135709389995, "loss": 1.7676, "step": 1730 }, { "epoch": 0.6, "grad_norm": 0.9788106679916382, "learning_rate": 0.00014043865661411925, "loss": 1.7303, "step": 1740 }, { "epoch": 0.6, "grad_norm": 0.6612042784690857, "learning_rate": 0.00014009595613433858, "loss": 1.675, "step": 1750 }, { "epoch": 0.6, "grad_norm": 0.8740193247795105, "learning_rate": 0.00013975325565455794, "loss": 1.7945, "step": 1760 }, { "epoch": 0.61, "grad_norm": 0.9548364877700806, "learning_rate": 0.00013941055517477727, "loss": 1.7485, "step": 1770 }, { "epoch": 0.61, "grad_norm": 0.6676565408706665, "learning_rate": 0.00013906785469499657, "loss": 1.7479, "step": 1780 }, { "epoch": 0.61, "grad_norm": 0.6287640333175659, "learning_rate": 0.0001387251542152159, "loss": 1.7007, "step": 1790 }, { "epoch": 0.62, "grad_norm": 1.5443295240402222, "learning_rate": 0.00013838245373543523, "loss": 1.8916, "step": 1800 }, { "epoch": 0.62, "grad_norm": 0.9970656037330627, "learning_rate": 0.00013803975325565456, "loss": 1.6733, "step": 1810 }, { "epoch": 0.62, "grad_norm": 0.9320075511932373, "learning_rate": 0.00013769705277587391, "loss": 1.8622, "step": 1820 }, { "epoch": 0.63, "grad_norm": 0.8384440541267395, "learning_rate": 0.00013735435229609322, "loss": 1.6825, "step": 1830 }, { "epoch": 0.63, "grad_norm": 1.1807342767715454, "learning_rate": 0.00013701165181631255, "loss": 1.6548, "step": 1840 }, { "epoch": 0.63, "grad_norm": 0.7640541195869446, "learning_rate": 0.00013666895133653188, "loss": 1.8134, "step": 1850 }, { "epoch": 0.64, "grad_norm": 0.9137887358665466, "learning_rate": 0.0001363262508567512, "loss": 1.7685, "step": 1860 }, { "epoch": 0.64, "grad_norm": 0.8986667394638062, "learning_rate": 0.00013598355037697054, "loss": 1.7455, "step": 1870 }, { "epoch": 0.64, "grad_norm": 0.96836918592453, "learning_rate": 0.00013564084989718987, "loss": 1.8705, "step": 1880 }, { "epoch": 0.65, "grad_norm": 1.381028175354004, "learning_rate": 0.0001352981494174092, "loss": 1.7644, "step": 1890 }, { "epoch": 0.65, "grad_norm": 0.617438018321991, "learning_rate": 0.00013495544893762853, "loss": 1.6194, "step": 1900 }, { "epoch": 0.65, "grad_norm": 0.8686628937721252, "learning_rate": 0.00013461274845784783, "loss": 1.7171, "step": 1910 }, { "epoch": 0.66, "grad_norm": 0.7735409140586853, "learning_rate": 0.00013427004797806716, "loss": 1.725, "step": 1920 }, { "epoch": 0.66, "grad_norm": 1.0692516565322876, "learning_rate": 0.00013392734749828651, "loss": 1.762, "step": 1930 }, { "epoch": 0.66, "grad_norm": 0.763136625289917, "learning_rate": 0.00013358464701850584, "loss": 1.6546, "step": 1940 }, { "epoch": 0.67, "grad_norm": 0.9908429980278015, "learning_rate": 0.00013324194653872517, "loss": 1.6499, "step": 1950 }, { "epoch": 0.67, "grad_norm": 0.9493003487586975, "learning_rate": 0.00013289924605894448, "loss": 1.5616, "step": 1960 }, { "epoch": 0.67, "grad_norm": 0.8336248993873596, "learning_rate": 0.0001325565455791638, "loss": 1.7914, "step": 1970 }, { "epoch": 0.68, "grad_norm": 0.8938840627670288, "learning_rate": 0.00013221384509938314, "loss": 1.7274, "step": 1980 }, { "epoch": 0.68, "grad_norm": 1.0243479013442993, "learning_rate": 0.0001318711446196025, "loss": 1.6643, "step": 1990 }, { "epoch": 0.68, "grad_norm": 1.0226181745529175, "learning_rate": 0.0001315284441398218, "loss": 1.7626, "step": 2000 }, { "epoch": 0.68, "eval_loss": 1.8913378715515137, "eval_runtime": 33.1473, "eval_samples_per_second": 30.168, "eval_steps_per_second": 3.771, "step": 2000 }, { "epoch": 0.69, "grad_norm": 1.1059471368789673, "learning_rate": 0.00013118574366004112, "loss": 1.6362, "step": 2010 }, { "epoch": 0.69, "grad_norm": 1.3754314184188843, "learning_rate": 0.00013084304318026045, "loss": 1.8308, "step": 2020 }, { "epoch": 0.7, "grad_norm": 1.3899627923965454, "learning_rate": 0.00013050034270047978, "loss": 1.6982, "step": 2030 }, { "epoch": 0.7, "grad_norm": 0.8804599046707153, "learning_rate": 0.00013015764222069911, "loss": 1.8138, "step": 2040 }, { "epoch": 0.7, "grad_norm": 0.6578095555305481, "learning_rate": 0.00012981494174091844, "loss": 1.7211, "step": 2050 }, { "epoch": 0.71, "grad_norm": 1.5725558996200562, "learning_rate": 0.00012947224126113777, "loss": 1.8684, "step": 2060 }, { "epoch": 0.71, "grad_norm": 1.097717523574829, "learning_rate": 0.0001291295407813571, "loss": 1.7705, "step": 2070 }, { "epoch": 0.71, "grad_norm": 0.7564202547073364, "learning_rate": 0.00012878684030157643, "loss": 1.5935, "step": 2080 }, { "epoch": 0.72, "grad_norm": 0.732243537902832, "learning_rate": 0.00012844413982179576, "loss": 1.7694, "step": 2090 }, { "epoch": 0.72, "grad_norm": 0.6464608907699585, "learning_rate": 0.0001281014393420151, "loss": 1.8418, "step": 2100 }, { "epoch": 0.72, "grad_norm": 0.7090341448783875, "learning_rate": 0.00012775873886223442, "loss": 1.8122, "step": 2110 }, { "epoch": 0.73, "grad_norm": 1.1480237245559692, "learning_rate": 0.00012741603838245375, "loss": 1.7766, "step": 2120 }, { "epoch": 0.73, "grad_norm": 0.6737000346183777, "learning_rate": 0.00012707333790267305, "loss": 1.7876, "step": 2130 }, { "epoch": 0.73, "grad_norm": 0.7794924378395081, "learning_rate": 0.00012673063742289238, "loss": 1.8529, "step": 2140 }, { "epoch": 0.74, "grad_norm": 1.3136320114135742, "learning_rate": 0.00012638793694311174, "loss": 1.6699, "step": 2150 }, { "epoch": 0.74, "grad_norm": 0.884027361869812, "learning_rate": 0.00012604523646333107, "loss": 1.7689, "step": 2160 }, { "epoch": 0.74, "grad_norm": 1.103605031967163, "learning_rate": 0.00012570253598355037, "loss": 1.8594, "step": 2170 }, { "epoch": 0.75, "grad_norm": 1.3322539329528809, "learning_rate": 0.0001253598355037697, "loss": 1.6765, "step": 2180 }, { "epoch": 0.75, "grad_norm": 0.7840645909309387, "learning_rate": 0.00012501713502398903, "loss": 1.65, "step": 2190 }, { "epoch": 0.75, "grad_norm": 0.9259356260299683, "learning_rate": 0.00012467443454420836, "loss": 1.7805, "step": 2200 }, { "epoch": 0.76, "grad_norm": 1.3709288835525513, "learning_rate": 0.0001243317340644277, "loss": 1.7086, "step": 2210 }, { "epoch": 0.76, "grad_norm": 0.6325123310089111, "learning_rate": 0.00012398903358464702, "loss": 1.7124, "step": 2220 }, { "epoch": 0.76, "grad_norm": 0.854541003704071, "learning_rate": 0.00012364633310486635, "loss": 1.7089, "step": 2230 }, { "epoch": 0.77, "grad_norm": 0.8861531019210815, "learning_rate": 0.00012330363262508568, "loss": 1.8369, "step": 2240 }, { "epoch": 0.77, "grad_norm": 1.269750714302063, "learning_rate": 0.000122960932145305, "loss": 1.7598, "step": 2250 }, { "epoch": 0.77, "grad_norm": 0.999598503112793, "learning_rate": 0.00012261823166552434, "loss": 1.8376, "step": 2260 }, { "epoch": 0.78, "grad_norm": 0.7654330134391785, "learning_rate": 0.00012227553118574367, "loss": 1.7236, "step": 2270 }, { "epoch": 0.78, "grad_norm": 1.11728835105896, "learning_rate": 0.000121932830705963, "loss": 1.7375, "step": 2280 }, { "epoch": 0.78, "grad_norm": 0.7219797968864441, "learning_rate": 0.00012159013022618233, "loss": 1.7786, "step": 2290 }, { "epoch": 0.79, "grad_norm": 1.0127757787704468, "learning_rate": 0.00012124742974640165, "loss": 1.7003, "step": 2300 }, { "epoch": 0.79, "grad_norm": 1.0450137853622437, "learning_rate": 0.00012090472926662097, "loss": 1.7425, "step": 2310 }, { "epoch": 0.79, "grad_norm": 0.9303760528564453, "learning_rate": 0.0001205620287868403, "loss": 1.632, "step": 2320 }, { "epoch": 0.8, "grad_norm": 0.7303478717803955, "learning_rate": 0.00012021932830705965, "loss": 1.6918, "step": 2330 }, { "epoch": 0.8, "grad_norm": 0.6323578953742981, "learning_rate": 0.00011987662782727895, "loss": 1.672, "step": 2340 }, { "epoch": 0.8, "grad_norm": 0.715811014175415, "learning_rate": 0.00011953392734749828, "loss": 1.7613, "step": 2350 }, { "epoch": 0.81, "grad_norm": 0.7297527194023132, "learning_rate": 0.00011919122686771762, "loss": 1.7277, "step": 2360 }, { "epoch": 0.81, "grad_norm": 1.0844471454620361, "learning_rate": 0.00011884852638793695, "loss": 1.8143, "step": 2370 }, { "epoch": 0.82, "grad_norm": 0.9260643720626831, "learning_rate": 0.00011850582590815628, "loss": 1.7228, "step": 2380 }, { "epoch": 0.82, "grad_norm": 0.9541537761688232, "learning_rate": 0.0001181631254283756, "loss": 1.7143, "step": 2390 }, { "epoch": 0.82, "grad_norm": 1.0506033897399902, "learning_rate": 0.00011782042494859493, "loss": 1.7659, "step": 2400 }, { "epoch": 0.83, "grad_norm": 0.7201717495918274, "learning_rate": 0.00011747772446881427, "loss": 1.7257, "step": 2410 }, { "epoch": 0.83, "grad_norm": 0.8612362742424011, "learning_rate": 0.0001171350239890336, "loss": 1.7009, "step": 2420 }, { "epoch": 0.83, "grad_norm": 0.8745547533035278, "learning_rate": 0.0001167923235092529, "loss": 1.733, "step": 2430 }, { "epoch": 0.84, "grad_norm": 0.5927043557167053, "learning_rate": 0.00011644962302947225, "loss": 1.7724, "step": 2440 }, { "epoch": 0.84, "grad_norm": 0.6471837162971497, "learning_rate": 0.00011610692254969158, "loss": 1.7103, "step": 2450 }, { "epoch": 0.84, "grad_norm": 1.1340347528457642, "learning_rate": 0.0001157642220699109, "loss": 1.7053, "step": 2460 }, { "epoch": 0.85, "grad_norm": 0.8819349408149719, "learning_rate": 0.00011542152159013022, "loss": 1.7552, "step": 2470 }, { "epoch": 0.85, "grad_norm": 0.6587919592857361, "learning_rate": 0.00011507882111034955, "loss": 1.6482, "step": 2480 }, { "epoch": 0.85, "grad_norm": 1.0057884454727173, "learning_rate": 0.00011473612063056888, "loss": 1.7711, "step": 2490 }, { "epoch": 0.86, "grad_norm": 0.6465263962745667, "learning_rate": 0.00011439342015078823, "loss": 1.7565, "step": 2500 }, { "epoch": 0.86, "eval_loss": 1.8792312145233154, "eval_runtime": 33.1087, "eval_samples_per_second": 30.204, "eval_steps_per_second": 3.775, "step": 2500 }, { "epoch": 0.86, "grad_norm": 0.5970360040664673, "learning_rate": 0.00011405071967100756, "loss": 1.7179, "step": 2510 }, { "epoch": 0.86, "grad_norm": 1.3015583753585815, "learning_rate": 0.00011370801919122687, "loss": 1.7225, "step": 2520 }, { "epoch": 0.87, "grad_norm": 0.9235218167304993, "learning_rate": 0.0001133653187114462, "loss": 1.7657, "step": 2530 }, { "epoch": 0.87, "grad_norm": 1.025038480758667, "learning_rate": 0.00011302261823166553, "loss": 1.7755, "step": 2540 }, { "epoch": 0.87, "grad_norm": 0.8988834619522095, "learning_rate": 0.00011267991775188486, "loss": 1.8187, "step": 2550 }, { "epoch": 0.88, "grad_norm": 0.7810622453689575, "learning_rate": 0.00011233721727210418, "loss": 1.6565, "step": 2560 }, { "epoch": 0.88, "grad_norm": 1.6817054748535156, "learning_rate": 0.0001119945167923235, "loss": 1.7764, "step": 2570 }, { "epoch": 0.88, "grad_norm": 0.9688411355018616, "learning_rate": 0.00011165181631254285, "loss": 1.6599, "step": 2580 }, { "epoch": 0.89, "grad_norm": 0.742932915687561, "learning_rate": 0.00011130911583276218, "loss": 1.7552, "step": 2590 }, { "epoch": 0.89, "grad_norm": 0.5261206030845642, "learning_rate": 0.0001109664153529815, "loss": 1.6432, "step": 2600 }, { "epoch": 0.89, "grad_norm": 0.8997339606285095, "learning_rate": 0.00011062371487320082, "loss": 1.8438, "step": 2610 }, { "epoch": 0.9, "grad_norm": 0.8077126741409302, "learning_rate": 0.00011028101439342015, "loss": 1.8144, "step": 2620 }, { "epoch": 0.9, "grad_norm": 0.9872453212738037, "learning_rate": 0.00010993831391363948, "loss": 1.7427, "step": 2630 }, { "epoch": 0.9, "grad_norm": 1.1201390027999878, "learning_rate": 0.00010959561343385883, "loss": 1.7696, "step": 2640 }, { "epoch": 0.91, "grad_norm": 1.1584488153457642, "learning_rate": 0.00010925291295407813, "loss": 1.6236, "step": 2650 }, { "epoch": 0.91, "grad_norm": 0.8254250884056091, "learning_rate": 0.00010891021247429747, "loss": 1.6214, "step": 2660 }, { "epoch": 0.91, "grad_norm": 0.9825947284698486, "learning_rate": 0.0001085675119945168, "loss": 1.7889, "step": 2670 }, { "epoch": 0.92, "grad_norm": 1.0265246629714966, "learning_rate": 0.00010822481151473613, "loss": 1.7283, "step": 2680 }, { "epoch": 0.92, "grad_norm": 0.891777515411377, "learning_rate": 0.00010788211103495545, "loss": 1.8176, "step": 2690 }, { "epoch": 0.92, "grad_norm": 0.8920706510543823, "learning_rate": 0.00010753941055517478, "loss": 1.7676, "step": 2700 }, { "epoch": 0.93, "grad_norm": 1.072204828262329, "learning_rate": 0.00010719671007539411, "loss": 1.5836, "step": 2710 }, { "epoch": 0.93, "grad_norm": 0.9175311923027039, "learning_rate": 0.00010685400959561345, "loss": 1.8073, "step": 2720 }, { "epoch": 0.94, "grad_norm": 0.6199253797531128, "learning_rate": 0.00010651130911583275, "loss": 1.828, "step": 2730 }, { "epoch": 0.94, "grad_norm": 0.653229296207428, "learning_rate": 0.0001061686086360521, "loss": 1.7308, "step": 2740 }, { "epoch": 0.94, "grad_norm": 0.790413498878479, "learning_rate": 0.00010582590815627143, "loss": 1.8169, "step": 2750 }, { "epoch": 0.95, "grad_norm": 0.8657679557800293, "learning_rate": 0.00010548320767649076, "loss": 1.7453, "step": 2760 }, { "epoch": 0.95, "grad_norm": 0.6758552193641663, "learning_rate": 0.00010514050719671007, "loss": 1.7171, "step": 2770 }, { "epoch": 0.95, "grad_norm": 1.0935484170913696, "learning_rate": 0.0001047978067169294, "loss": 1.6754, "step": 2780 }, { "epoch": 0.96, "grad_norm": 0.8095535635948181, "learning_rate": 0.00010445510623714873, "loss": 1.8387, "step": 2790 }, { "epoch": 0.96, "grad_norm": 0.8804395198822021, "learning_rate": 0.00010411240575736808, "loss": 1.7839, "step": 2800 }, { "epoch": 0.96, "grad_norm": 0.945090115070343, "learning_rate": 0.0001037697052775874, "loss": 1.7196, "step": 2810 }, { "epoch": 0.97, "grad_norm": 0.6158414483070374, "learning_rate": 0.00010342700479780672, "loss": 1.8011, "step": 2820 }, { "epoch": 0.97, "grad_norm": 0.7917384505271912, "learning_rate": 0.00010308430431802605, "loss": 1.744, "step": 2830 }, { "epoch": 0.97, "grad_norm": 0.6415919065475464, "learning_rate": 0.00010274160383824538, "loss": 1.6379, "step": 2840 }, { "epoch": 0.98, "grad_norm": 0.6077090501785278, "learning_rate": 0.00010239890335846471, "loss": 1.657, "step": 2850 }, { "epoch": 0.98, "grad_norm": 1.036901593208313, "learning_rate": 0.00010205620287868403, "loss": 1.7059, "step": 2860 }, { "epoch": 0.98, "grad_norm": 0.7633301019668579, "learning_rate": 0.00010171350239890336, "loss": 1.8085, "step": 2870 }, { "epoch": 0.99, "grad_norm": 1.04219651222229, "learning_rate": 0.0001013708019191227, "loss": 1.6641, "step": 2880 }, { "epoch": 0.99, "grad_norm": 0.9899976849555969, "learning_rate": 0.00010102810143934203, "loss": 1.6819, "step": 2890 }, { "epoch": 0.99, "grad_norm": 0.755636990070343, "learning_rate": 0.00010068540095956133, "loss": 1.7573, "step": 2900 }, { "epoch": 1.0, "grad_norm": 1.1326630115509033, "learning_rate": 0.00010034270047978068, "loss": 1.6942, "step": 2910 }, { "epoch": 1.0, "grad_norm": 0.7579949498176575, "learning_rate": 0.0001, "loss": 1.6903, "step": 2920 }, { "epoch": 1.0, "grad_norm": 0.7203909754753113, "learning_rate": 9.965729952021933e-05, "loss": 1.5946, "step": 2930 }, { "epoch": 1.01, "grad_norm": 0.8731165528297424, "learning_rate": 9.931459904043866e-05, "loss": 1.6224, "step": 2940 }, { "epoch": 1.01, "grad_norm": 0.6287246942520142, "learning_rate": 9.8971898560658e-05, "loss": 1.6279, "step": 2950 }, { "epoch": 1.01, "grad_norm": 0.8794381618499756, "learning_rate": 9.862919808087731e-05, "loss": 1.7103, "step": 2960 }, { "epoch": 1.02, "grad_norm": 1.1305402517318726, "learning_rate": 9.828649760109665e-05, "loss": 1.5876, "step": 2970 }, { "epoch": 1.02, "grad_norm": 1.349693775177002, "learning_rate": 9.794379712131597e-05, "loss": 1.549, "step": 2980 }, { "epoch": 1.02, "grad_norm": 1.1124284267425537, "learning_rate": 9.76010966415353e-05, "loss": 1.5688, "step": 2990 }, { "epoch": 1.03, "grad_norm": 0.5864982604980469, "learning_rate": 9.725839616175463e-05, "loss": 1.6437, "step": 3000 }, { "epoch": 1.03, "eval_loss": 1.8886157274246216, "eval_runtime": 33.1481, "eval_samples_per_second": 30.168, "eval_steps_per_second": 3.771, "step": 3000 }, { "epoch": 1.03, "grad_norm": 0.8807237148284912, "learning_rate": 9.691569568197396e-05, "loss": 1.5888, "step": 3010 }, { "epoch": 1.03, "grad_norm": 0.8454139232635498, "learning_rate": 9.657299520219329e-05, "loss": 1.5414, "step": 3020 }, { "epoch": 1.04, "grad_norm": 0.9541159272193909, "learning_rate": 9.623029472241262e-05, "loss": 1.7525, "step": 3030 }, { "epoch": 1.04, "grad_norm": 1.38509202003479, "learning_rate": 9.588759424263193e-05, "loss": 1.5302, "step": 3040 }, { "epoch": 1.04, "grad_norm": 1.242966651916504, "learning_rate": 9.554489376285128e-05, "loss": 1.6085, "step": 3050 }, { "epoch": 1.05, "grad_norm": 1.1269468069076538, "learning_rate": 9.52021932830706e-05, "loss": 1.603, "step": 3060 }, { "epoch": 1.05, "grad_norm": 1.1521382331848145, "learning_rate": 9.485949280328992e-05, "loss": 1.6984, "step": 3070 }, { "epoch": 1.05, "grad_norm": 1.3359086513519287, "learning_rate": 9.451679232350927e-05, "loss": 1.4839, "step": 3080 }, { "epoch": 1.06, "grad_norm": 1.057581901550293, "learning_rate": 9.417409184372858e-05, "loss": 1.5541, "step": 3090 }, { "epoch": 1.06, "grad_norm": 1.090909719467163, "learning_rate": 9.383139136394791e-05, "loss": 1.5811, "step": 3100 }, { "epoch": 1.07, "grad_norm": 1.3244885206222534, "learning_rate": 9.348869088416724e-05, "loss": 1.6006, "step": 3110 }, { "epoch": 1.07, "grad_norm": 0.8855965733528137, "learning_rate": 9.314599040438657e-05, "loss": 1.5577, "step": 3120 }, { "epoch": 1.07, "grad_norm": 0.9480008482933044, "learning_rate": 9.28032899246059e-05, "loss": 1.6064, "step": 3130 }, { "epoch": 1.08, "grad_norm": 1.397888422012329, "learning_rate": 9.246058944482523e-05, "loss": 1.5708, "step": 3140 }, { "epoch": 1.08, "grad_norm": 0.8178092241287231, "learning_rate": 9.211788896504455e-05, "loss": 1.4722, "step": 3150 }, { "epoch": 1.08, "grad_norm": 1.3776417970657349, "learning_rate": 9.177518848526389e-05, "loss": 1.6941, "step": 3160 }, { "epoch": 1.09, "grad_norm": 1.3224530220031738, "learning_rate": 9.14324880054832e-05, "loss": 1.5414, "step": 3170 }, { "epoch": 1.09, "grad_norm": 1.3367009162902832, "learning_rate": 9.108978752570254e-05, "loss": 1.6275, "step": 3180 }, { "epoch": 1.09, "grad_norm": 1.0063951015472412, "learning_rate": 9.074708704592187e-05, "loss": 1.6761, "step": 3190 }, { "epoch": 1.1, "grad_norm": 1.320760726928711, "learning_rate": 9.04043865661412e-05, "loss": 1.5896, "step": 3200 }, { "epoch": 1.1, "grad_norm": 1.0159672498703003, "learning_rate": 9.006168608636053e-05, "loss": 1.5728, "step": 3210 }, { "epoch": 1.1, "grad_norm": 1.095314383506775, "learning_rate": 8.971898560657985e-05, "loss": 1.5329, "step": 3220 }, { "epoch": 1.11, "grad_norm": 1.212713360786438, "learning_rate": 8.937628512679918e-05, "loss": 1.5746, "step": 3230 }, { "epoch": 1.11, "grad_norm": 0.8203460574150085, "learning_rate": 8.903358464701851e-05, "loss": 1.6119, "step": 3240 }, { "epoch": 1.11, "grad_norm": 0.9643343091011047, "learning_rate": 8.869088416723784e-05, "loss": 1.5893, "step": 3250 }, { "epoch": 1.12, "grad_norm": 1.2415894269943237, "learning_rate": 8.834818368745716e-05, "loss": 1.6291, "step": 3260 }, { "epoch": 1.12, "grad_norm": 1.826658844947815, "learning_rate": 8.80054832076765e-05, "loss": 1.6394, "step": 3270 }, { "epoch": 1.12, "grad_norm": 1.3455665111541748, "learning_rate": 8.766278272789582e-05, "loss": 1.568, "step": 3280 }, { "epoch": 1.13, "grad_norm": 1.8909701108932495, "learning_rate": 8.732008224811515e-05, "loss": 1.5733, "step": 3290 }, { "epoch": 1.13, "grad_norm": 1.4277849197387695, "learning_rate": 8.697738176833448e-05, "loss": 1.6339, "step": 3300 }, { "epoch": 1.13, "grad_norm": 0.9563093185424805, "learning_rate": 8.663468128855381e-05, "loss": 1.5775, "step": 3310 }, { "epoch": 1.14, "grad_norm": 0.8461637496948242, "learning_rate": 8.629198080877314e-05, "loss": 1.653, "step": 3320 }, { "epoch": 1.14, "grad_norm": 1.0858458280563354, "learning_rate": 8.594928032899247e-05, "loss": 1.4778, "step": 3330 }, { "epoch": 1.14, "grad_norm": 1.1627178192138672, "learning_rate": 8.560657984921178e-05, "loss": 1.5374, "step": 3340 }, { "epoch": 1.15, "grad_norm": 1.196664571762085, "learning_rate": 8.526387936943113e-05, "loss": 1.6483, "step": 3350 }, { "epoch": 1.15, "grad_norm": 1.1990993022918701, "learning_rate": 8.492117888965046e-05, "loss": 1.5993, "step": 3360 }, { "epoch": 1.15, "grad_norm": 1.0623687505722046, "learning_rate": 8.457847840986977e-05, "loss": 1.5743, "step": 3370 }, { "epoch": 1.16, "grad_norm": 1.1684637069702148, "learning_rate": 8.423577793008912e-05, "loss": 1.5546, "step": 3380 }, { "epoch": 1.16, "grad_norm": 1.2448011636734009, "learning_rate": 8.389307745030843e-05, "loss": 1.496, "step": 3390 }, { "epoch": 1.16, "grad_norm": 0.9411953091621399, "learning_rate": 8.355037697052776e-05, "loss": 1.5966, "step": 3400 }, { "epoch": 1.17, "grad_norm": 1.0667563676834106, "learning_rate": 8.320767649074709e-05, "loss": 1.5128, "step": 3410 }, { "epoch": 1.17, "grad_norm": 1.50753653049469, "learning_rate": 8.286497601096642e-05, "loss": 1.5772, "step": 3420 }, { "epoch": 1.17, "grad_norm": 0.9346134662628174, "learning_rate": 8.252227553118574e-05, "loss": 1.6321, "step": 3430 }, { "epoch": 1.18, "grad_norm": 1.304190754890442, "learning_rate": 8.217957505140508e-05, "loss": 1.5656, "step": 3440 }, { "epoch": 1.18, "grad_norm": 1.058018684387207, "learning_rate": 8.18368745716244e-05, "loss": 1.5413, "step": 3450 }, { "epoch": 1.19, "grad_norm": 1.15809166431427, "learning_rate": 8.149417409184373e-05, "loss": 1.5673, "step": 3460 }, { "epoch": 1.19, "grad_norm": 1.092393159866333, "learning_rate": 8.115147361206306e-05, "loss": 1.5962, "step": 3470 }, { "epoch": 1.19, "grad_norm": 0.9390305876731873, "learning_rate": 8.080877313228239e-05, "loss": 1.5565, "step": 3480 }, { "epoch": 1.2, "grad_norm": 1.002120852470398, "learning_rate": 8.046607265250173e-05, "loss": 1.5803, "step": 3490 }, { "epoch": 1.2, "grad_norm": 1.0857172012329102, "learning_rate": 8.012337217272105e-05, "loss": 1.6345, "step": 3500 }, { "epoch": 1.2, "eval_loss": 1.8998303413391113, "eval_runtime": 33.1629, "eval_samples_per_second": 30.154, "eval_steps_per_second": 3.769, "step": 3500 }, { "epoch": 1.2, "grad_norm": 0.9931670427322388, "learning_rate": 7.978067169294038e-05, "loss": 1.605, "step": 3510 }, { "epoch": 1.21, "grad_norm": 1.3759890794754028, "learning_rate": 7.94379712131597e-05, "loss": 1.5059, "step": 3520 }, { "epoch": 1.21, "grad_norm": 1.2301968336105347, "learning_rate": 7.909527073337903e-05, "loss": 1.582, "step": 3530 }, { "epoch": 1.21, "grad_norm": 1.1518924236297607, "learning_rate": 7.875257025359835e-05, "loss": 1.5839, "step": 3540 }, { "epoch": 1.22, "grad_norm": 0.9161165952682495, "learning_rate": 7.84098697738177e-05, "loss": 1.5494, "step": 3550 }, { "epoch": 1.22, "grad_norm": 1.250705599784851, "learning_rate": 7.806716929403701e-05, "loss": 1.5178, "step": 3560 }, { "epoch": 1.22, "grad_norm": 0.7702249884605408, "learning_rate": 7.772446881425634e-05, "loss": 1.644, "step": 3570 }, { "epoch": 1.23, "grad_norm": 1.4425973892211914, "learning_rate": 7.738176833447567e-05, "loss": 1.5009, "step": 3580 }, { "epoch": 1.23, "grad_norm": 1.2036337852478027, "learning_rate": 7.7039067854695e-05, "loss": 1.5456, "step": 3590 }, { "epoch": 1.23, "grad_norm": 1.4006402492523193, "learning_rate": 7.669636737491433e-05, "loss": 1.5511, "step": 3600 }, { "epoch": 1.24, "grad_norm": 1.1983481645584106, "learning_rate": 7.635366689513366e-05, "loss": 1.5645, "step": 3610 }, { "epoch": 1.24, "grad_norm": 1.2755049467086792, "learning_rate": 7.601096641535297e-05, "loss": 1.6512, "step": 3620 }, { "epoch": 1.24, "grad_norm": 1.3783161640167236, "learning_rate": 7.566826593557232e-05, "loss": 1.6747, "step": 3630 }, { "epoch": 1.25, "grad_norm": 1.1947081089019775, "learning_rate": 7.532556545579165e-05, "loss": 1.6605, "step": 3640 }, { "epoch": 1.25, "grad_norm": 1.2230151891708374, "learning_rate": 7.498286497601096e-05, "loss": 1.6187, "step": 3650 }, { "epoch": 1.25, "grad_norm": 1.372226595878601, "learning_rate": 7.464016449623031e-05, "loss": 1.6354, "step": 3660 }, { "epoch": 1.26, "grad_norm": 1.2375085353851318, "learning_rate": 7.429746401644962e-05, "loss": 1.656, "step": 3670 }, { "epoch": 1.26, "grad_norm": 0.9703730940818787, "learning_rate": 7.395476353666895e-05, "loss": 1.5571, "step": 3680 }, { "epoch": 1.26, "grad_norm": 1.3475947380065918, "learning_rate": 7.361206305688828e-05, "loss": 1.5487, "step": 3690 }, { "epoch": 1.27, "grad_norm": 1.3879302740097046, "learning_rate": 7.326936257710761e-05, "loss": 1.6702, "step": 3700 }, { "epoch": 1.27, "grad_norm": 1.4043548107147217, "learning_rate": 7.292666209732694e-05, "loss": 1.5555, "step": 3710 }, { "epoch": 1.27, "grad_norm": 1.2937321662902832, "learning_rate": 7.258396161754627e-05, "loss": 1.5959, "step": 3720 }, { "epoch": 1.28, "grad_norm": 1.4525338411331177, "learning_rate": 7.224126113776559e-05, "loss": 1.6252, "step": 3730 }, { "epoch": 1.28, "grad_norm": 1.1089144945144653, "learning_rate": 7.189856065798493e-05, "loss": 1.5027, "step": 3740 }, { "epoch": 1.28, "grad_norm": 1.2625998258590698, "learning_rate": 7.155586017820425e-05, "loss": 1.5907, "step": 3750 }, { "epoch": 1.29, "grad_norm": 1.2458665370941162, "learning_rate": 7.121315969842358e-05, "loss": 1.54, "step": 3760 }, { "epoch": 1.29, "grad_norm": 1.2830859422683716, "learning_rate": 7.087045921864292e-05, "loss": 1.5867, "step": 3770 }, { "epoch": 1.29, "grad_norm": 1.0032719373703003, "learning_rate": 7.052775873886224e-05, "loss": 1.5374, "step": 3780 }, { "epoch": 1.3, "grad_norm": 0.9105421304702759, "learning_rate": 7.018505825908157e-05, "loss": 1.528, "step": 3790 }, { "epoch": 1.3, "grad_norm": 1.3588030338287354, "learning_rate": 6.98423577793009e-05, "loss": 1.6368, "step": 3800 }, { "epoch": 1.3, "grad_norm": 1.4903500080108643, "learning_rate": 6.949965729952023e-05, "loss": 1.675, "step": 3810 }, { "epoch": 1.31, "grad_norm": 1.229722261428833, "learning_rate": 6.915695681973956e-05, "loss": 1.555, "step": 3820 }, { "epoch": 1.31, "grad_norm": 0.9523776769638062, "learning_rate": 6.881425633995888e-05, "loss": 1.6608, "step": 3830 }, { "epoch": 1.32, "grad_norm": 1.986708164215088, "learning_rate": 6.84715558601782e-05, "loss": 1.7199, "step": 3840 }, { "epoch": 1.32, "grad_norm": 0.79183429479599, "learning_rate": 6.812885538039754e-05, "loss": 1.5034, "step": 3850 }, { "epoch": 1.32, "grad_norm": 1.1760715246200562, "learning_rate": 6.778615490061686e-05, "loss": 1.6812, "step": 3860 }, { "epoch": 1.33, "grad_norm": 1.7899055480957031, "learning_rate": 6.744345442083619e-05, "loss": 1.7389, "step": 3870 }, { "epoch": 1.33, "grad_norm": 1.2628593444824219, "learning_rate": 6.710075394105552e-05, "loss": 1.5317, "step": 3880 }, { "epoch": 1.33, "grad_norm": 1.037351131439209, "learning_rate": 6.675805346127485e-05, "loss": 1.5858, "step": 3890 }, { "epoch": 1.34, "grad_norm": 1.2006704807281494, "learning_rate": 6.641535298149417e-05, "loss": 1.4587, "step": 3900 }, { "epoch": 1.34, "grad_norm": 1.0877715349197388, "learning_rate": 6.607265250171351e-05, "loss": 1.5306, "step": 3910 }, { "epoch": 1.34, "grad_norm": 1.4047476053237915, "learning_rate": 6.572995202193284e-05, "loss": 1.5603, "step": 3920 }, { "epoch": 1.35, "grad_norm": 1.2444441318511963, "learning_rate": 6.538725154215215e-05, "loss": 1.5809, "step": 3930 }, { "epoch": 1.35, "grad_norm": 1.5738134384155273, "learning_rate": 6.50445510623715e-05, "loss": 1.5606, "step": 3940 }, { "epoch": 1.35, "grad_norm": 1.4850690364837646, "learning_rate": 6.470185058259081e-05, "loss": 1.4945, "step": 3950 }, { "epoch": 1.36, "grad_norm": 1.3746342658996582, "learning_rate": 6.435915010281016e-05, "loss": 1.5152, "step": 3960 }, { "epoch": 1.36, "grad_norm": 1.139249324798584, "learning_rate": 6.401644962302947e-05, "loss": 1.6004, "step": 3970 }, { "epoch": 1.36, "grad_norm": 1.3590480089187622, "learning_rate": 6.36737491432488e-05, "loss": 1.4926, "step": 3980 }, { "epoch": 1.37, "grad_norm": 1.6366995573043823, "learning_rate": 6.333104866346813e-05, "loss": 1.6734, "step": 3990 }, { "epoch": 1.37, "grad_norm": 1.1154892444610596, "learning_rate": 6.298834818368746e-05, "loss": 1.5628, "step": 4000 }, { "epoch": 1.37, "eval_loss": 1.9075069427490234, "eval_runtime": 33.1019, "eval_samples_per_second": 30.21, "eval_steps_per_second": 3.776, "step": 4000 }, { "epoch": 1.37, "grad_norm": 1.123923897743225, "learning_rate": 6.264564770390678e-05, "loss": 1.6206, "step": 4010 }, { "epoch": 1.38, "grad_norm": 1.3015213012695312, "learning_rate": 6.230294722412612e-05, "loss": 1.6292, "step": 4020 }, { "epoch": 1.38, "grad_norm": 1.8867294788360596, "learning_rate": 6.196024674434544e-05, "loss": 1.6625, "step": 4030 }, { "epoch": 1.38, "grad_norm": 1.5840169191360474, "learning_rate": 6.161754626456477e-05, "loss": 1.6224, "step": 4040 }, { "epoch": 1.39, "grad_norm": 0.9141889810562134, "learning_rate": 6.12748457847841e-05, "loss": 1.5051, "step": 4050 }, { "epoch": 1.39, "grad_norm": 1.5261061191558838, "learning_rate": 6.093214530500343e-05, "loss": 1.4289, "step": 4060 }, { "epoch": 1.39, "grad_norm": 1.2253016233444214, "learning_rate": 6.0589444825222764e-05, "loss": 1.6065, "step": 4070 }, { "epoch": 1.4, "grad_norm": 1.7163646221160889, "learning_rate": 6.0246744345442087e-05, "loss": 1.5978, "step": 4080 }, { "epoch": 1.4, "grad_norm": 1.0204969644546509, "learning_rate": 5.9904043865661416e-05, "loss": 1.6267, "step": 4090 }, { "epoch": 1.4, "grad_norm": 1.9314994812011719, "learning_rate": 5.956134338588074e-05, "loss": 1.6486, "step": 4100 }, { "epoch": 1.41, "grad_norm": 1.1685149669647217, "learning_rate": 5.9218642906100076e-05, "loss": 1.6397, "step": 4110 }, { "epoch": 1.41, "grad_norm": 1.422166347503662, "learning_rate": 5.88759424263194e-05, "loss": 1.6419, "step": 4120 }, { "epoch": 1.41, "grad_norm": 1.3074285984039307, "learning_rate": 5.853324194653873e-05, "loss": 1.565, "step": 4130 }, { "epoch": 1.42, "grad_norm": 0.965584933757782, "learning_rate": 5.819054146675805e-05, "loss": 1.5841, "step": 4140 }, { "epoch": 1.42, "grad_norm": 0.9101732969284058, "learning_rate": 5.784784098697739e-05, "loss": 1.6144, "step": 4150 }, { "epoch": 1.42, "grad_norm": 1.183640718460083, "learning_rate": 5.750514050719671e-05, "loss": 1.5998, "step": 4160 }, { "epoch": 1.43, "grad_norm": 1.1072790622711182, "learning_rate": 5.716244002741604e-05, "loss": 1.4634, "step": 4170 }, { "epoch": 1.43, "grad_norm": 1.608017086982727, "learning_rate": 5.681973954763536e-05, "loss": 1.5629, "step": 4180 }, { "epoch": 1.44, "grad_norm": 1.4969751834869385, "learning_rate": 5.64770390678547e-05, "loss": 1.5966, "step": 4190 }, { "epoch": 1.44, "grad_norm": 1.727695107460022, "learning_rate": 5.613433858807403e-05, "loss": 1.5456, "step": 4200 }, { "epoch": 1.44, "grad_norm": 1.4587767124176025, "learning_rate": 5.579163810829335e-05, "loss": 1.5238, "step": 4210 }, { "epoch": 1.45, "grad_norm": 1.5338579416275024, "learning_rate": 5.544893762851269e-05, "loss": 1.5485, "step": 4220 }, { "epoch": 1.45, "grad_norm": 0.8002244234085083, "learning_rate": 5.510623714873201e-05, "loss": 1.634, "step": 4230 }, { "epoch": 1.45, "grad_norm": 1.281417727470398, "learning_rate": 5.476353666895134e-05, "loss": 1.589, "step": 4240 }, { "epoch": 1.46, "grad_norm": 0.906808078289032, "learning_rate": 5.4420836189170664e-05, "loss": 1.57, "step": 4250 }, { "epoch": 1.46, "grad_norm": 1.799028992652893, "learning_rate": 5.407813570939e-05, "loss": 1.623, "step": 4260 }, { "epoch": 1.46, "grad_norm": 1.2560220956802368, "learning_rate": 5.3735435229609324e-05, "loss": 1.4231, "step": 4270 }, { "epoch": 1.47, "grad_norm": 1.315132737159729, "learning_rate": 5.339273474982865e-05, "loss": 1.553, "step": 4280 }, { "epoch": 1.47, "grad_norm": 1.1687719821929932, "learning_rate": 5.3050034270047976e-05, "loss": 1.691, "step": 4290 }, { "epoch": 1.47, "grad_norm": 1.182626724243164, "learning_rate": 5.270733379026731e-05, "loss": 1.58, "step": 4300 }, { "epoch": 1.48, "grad_norm": 0.819560170173645, "learning_rate": 5.2364633310486636e-05, "loss": 1.574, "step": 4310 }, { "epoch": 1.48, "grad_norm": 1.4093881845474243, "learning_rate": 5.2021932830705965e-05, "loss": 1.5805, "step": 4320 }, { "epoch": 1.48, "grad_norm": 2.079927921295166, "learning_rate": 5.167923235092529e-05, "loss": 1.6296, "step": 4330 }, { "epoch": 1.49, "grad_norm": 1.1056098937988281, "learning_rate": 5.1336531871144625e-05, "loss": 1.4964, "step": 4340 }, { "epoch": 1.49, "grad_norm": 1.924827218055725, "learning_rate": 5.0993831391363954e-05, "loss": 1.5223, "step": 4350 }, { "epoch": 1.49, "grad_norm": 1.461719274520874, "learning_rate": 5.065113091158328e-05, "loss": 1.5323, "step": 4360 }, { "epoch": 1.5, "grad_norm": 1.6647108793258667, "learning_rate": 5.0308430431802614e-05, "loss": 1.6025, "step": 4370 }, { "epoch": 1.5, "grad_norm": 1.33492910861969, "learning_rate": 4.996572995202194e-05, "loss": 1.5554, "step": 4380 }, { "epoch": 1.5, "grad_norm": 1.134582757949829, "learning_rate": 4.962302947224126e-05, "loss": 1.621, "step": 4390 }, { "epoch": 1.51, "grad_norm": 1.315508246421814, "learning_rate": 4.928032899246059e-05, "loss": 1.5828, "step": 4400 }, { "epoch": 1.51, "grad_norm": 1.3290214538574219, "learning_rate": 4.8937628512679926e-05, "loss": 1.578, "step": 4410 }, { "epoch": 1.51, "grad_norm": 1.2616337537765503, "learning_rate": 4.859492803289925e-05, "loss": 1.6177, "step": 4420 }, { "epoch": 1.52, "grad_norm": 1.4099230766296387, "learning_rate": 4.825222755311858e-05, "loss": 1.4926, "step": 4430 }, { "epoch": 1.52, "grad_norm": 0.9720429182052612, "learning_rate": 4.790952707333791e-05, "loss": 1.6552, "step": 4440 }, { "epoch": 1.52, "grad_norm": 1.1491189002990723, "learning_rate": 4.756682659355723e-05, "loss": 1.7001, "step": 4450 }, { "epoch": 1.53, "grad_norm": 1.1790263652801514, "learning_rate": 4.722412611377656e-05, "loss": 1.544, "step": 4460 }, { "epoch": 1.53, "grad_norm": 1.1880890130996704, "learning_rate": 4.688142563399589e-05, "loss": 1.6053, "step": 4470 }, { "epoch": 1.53, "grad_norm": 1.0895016193389893, "learning_rate": 4.653872515421522e-05, "loss": 1.455, "step": 4480 }, { "epoch": 1.54, "grad_norm": 1.230600118637085, "learning_rate": 4.619602467443454e-05, "loss": 1.5752, "step": 4490 }, { "epoch": 1.54, "grad_norm": 1.4027129411697388, "learning_rate": 4.585332419465387e-05, "loss": 1.5461, "step": 4500 }, { "epoch": 1.54, "eval_loss": 1.9048413038253784, "eval_runtime": 33.129, "eval_samples_per_second": 30.185, "eval_steps_per_second": 3.773, "step": 4500 }, { "epoch": 1.54, "grad_norm": 0.8590341806411743, "learning_rate": 4.55106237148732e-05, "loss": 1.6303, "step": 4510 }, { "epoch": 1.55, "grad_norm": 1.0827935934066772, "learning_rate": 4.516792323509253e-05, "loss": 1.5712, "step": 4520 }, { "epoch": 1.55, "grad_norm": 0.8795824646949768, "learning_rate": 4.4825222755311855e-05, "loss": 1.4882, "step": 4530 }, { "epoch": 1.55, "grad_norm": 1.509653091430664, "learning_rate": 4.4482522275531185e-05, "loss": 1.5534, "step": 4540 }, { "epoch": 1.56, "grad_norm": 1.0400638580322266, "learning_rate": 4.413982179575052e-05, "loss": 1.5681, "step": 4550 }, { "epoch": 1.56, "grad_norm": 1.1006004810333252, "learning_rate": 4.3797121315969844e-05, "loss": 1.5715, "step": 4560 }, { "epoch": 1.57, "grad_norm": 1.1621884107589722, "learning_rate": 4.3454420836189174e-05, "loss": 1.6373, "step": 4570 }, { "epoch": 1.57, "grad_norm": 1.0296626091003418, "learning_rate": 4.3111720356408503e-05, "loss": 1.6076, "step": 4580 }, { "epoch": 1.57, "grad_norm": 1.6784312725067139, "learning_rate": 4.276901987662783e-05, "loss": 1.6046, "step": 4590 }, { "epoch": 1.58, "grad_norm": 1.0730016231536865, "learning_rate": 4.2426319396847156e-05, "loss": 1.6317, "step": 4600 }, { "epoch": 1.58, "grad_norm": 1.0070710182189941, "learning_rate": 4.2083618917066486e-05, "loss": 1.5472, "step": 4610 }, { "epoch": 1.58, "grad_norm": 1.143546462059021, "learning_rate": 4.1740918437285815e-05, "loss": 1.4993, "step": 4620 }, { "epoch": 1.59, "grad_norm": 1.8565304279327393, "learning_rate": 4.1398217957505145e-05, "loss": 1.6021, "step": 4630 }, { "epoch": 1.59, "grad_norm": 1.1914728879928589, "learning_rate": 4.105551747772447e-05, "loss": 1.7231, "step": 4640 }, { "epoch": 1.59, "grad_norm": 1.6387224197387695, "learning_rate": 4.07128169979438e-05, "loss": 1.5804, "step": 4650 }, { "epoch": 1.6, "grad_norm": 1.65473210811615, "learning_rate": 4.037011651816313e-05, "loss": 1.6404, "step": 4660 }, { "epoch": 1.6, "grad_norm": 1.6097077131271362, "learning_rate": 4.002741603838245e-05, "loss": 1.4651, "step": 4670 }, { "epoch": 1.6, "grad_norm": 1.4290515184402466, "learning_rate": 3.968471555860178e-05, "loss": 1.5668, "step": 4680 }, { "epoch": 1.61, "grad_norm": 1.047481894493103, "learning_rate": 3.934201507882111e-05, "loss": 1.5275, "step": 4690 }, { "epoch": 1.61, "grad_norm": 1.3638914823532104, "learning_rate": 3.8999314599040446e-05, "loss": 1.6588, "step": 4700 }, { "epoch": 1.61, "grad_norm": 1.7712153196334839, "learning_rate": 3.865661411925977e-05, "loss": 1.6079, "step": 4710 }, { "epoch": 1.62, "grad_norm": 1.0898468494415283, "learning_rate": 3.83139136394791e-05, "loss": 1.574, "step": 4720 }, { "epoch": 1.62, "grad_norm": 1.4913599491119385, "learning_rate": 3.797121315969843e-05, "loss": 1.5376, "step": 4730 }, { "epoch": 1.62, "grad_norm": 1.225707769393921, "learning_rate": 3.762851267991775e-05, "loss": 1.5925, "step": 4740 }, { "epoch": 1.63, "grad_norm": 1.5699125528335571, "learning_rate": 3.728581220013708e-05, "loss": 1.593, "step": 4750 }, { "epoch": 1.63, "grad_norm": 1.318574070930481, "learning_rate": 3.694311172035641e-05, "loss": 1.5525, "step": 4760 }, { "epoch": 1.63, "grad_norm": 1.4544116258621216, "learning_rate": 3.660041124057574e-05, "loss": 1.5678, "step": 4770 }, { "epoch": 1.64, "grad_norm": 1.7460687160491943, "learning_rate": 3.6257710760795063e-05, "loss": 1.6081, "step": 4780 }, { "epoch": 1.64, "grad_norm": 1.4106998443603516, "learning_rate": 3.591501028101439e-05, "loss": 1.5687, "step": 4790 }, { "epoch": 1.64, "grad_norm": 1.0583499670028687, "learning_rate": 3.557230980123372e-05, "loss": 1.5467, "step": 4800 }, { "epoch": 1.65, "grad_norm": 1.2292665243148804, "learning_rate": 3.522960932145305e-05, "loss": 1.5491, "step": 4810 }, { "epoch": 1.65, "grad_norm": 1.3556251525878906, "learning_rate": 3.4886908841672375e-05, "loss": 1.5568, "step": 4820 }, { "epoch": 1.65, "grad_norm": 1.6374377012252808, "learning_rate": 3.4544208361891705e-05, "loss": 1.6016, "step": 4830 }, { "epoch": 1.66, "grad_norm": 1.0343750715255737, "learning_rate": 3.420150788211104e-05, "loss": 1.4693, "step": 4840 }, { "epoch": 1.66, "grad_norm": 1.378056526184082, "learning_rate": 3.3858807402330365e-05, "loss": 1.6081, "step": 4850 }, { "epoch": 1.66, "grad_norm": 1.370970368385315, "learning_rate": 3.3516106922549694e-05, "loss": 1.515, "step": 4860 }, { "epoch": 1.67, "grad_norm": 1.3780639171600342, "learning_rate": 3.3173406442769024e-05, "loss": 1.5644, "step": 4870 }, { "epoch": 1.67, "grad_norm": 1.0907922983169556, "learning_rate": 3.2830705962988354e-05, "loss": 1.5701, "step": 4880 }, { "epoch": 1.67, "grad_norm": 1.4807682037353516, "learning_rate": 3.2488005483207677e-05, "loss": 1.5535, "step": 4890 }, { "epoch": 1.68, "grad_norm": 1.7207825183868408, "learning_rate": 3.2145305003427006e-05, "loss": 1.6049, "step": 4900 }, { "epoch": 1.68, "grad_norm": 0.8784241676330566, "learning_rate": 3.1802604523646336e-05, "loss": 1.5213, "step": 4910 }, { "epoch": 1.69, "grad_norm": 1.6283917427062988, "learning_rate": 3.1459904043865666e-05, "loss": 1.4902, "step": 4920 }, { "epoch": 1.69, "grad_norm": 1.0017669200897217, "learning_rate": 3.111720356408499e-05, "loss": 1.5147, "step": 4930 }, { "epoch": 1.69, "grad_norm": 1.4256327152252197, "learning_rate": 3.077450308430432e-05, "loss": 1.518, "step": 4940 }, { "epoch": 1.7, "grad_norm": 1.4298090934753418, "learning_rate": 3.0431802604523645e-05, "loss": 1.5843, "step": 4950 }, { "epoch": 1.7, "grad_norm": 1.0894560813903809, "learning_rate": 3.0089102124742974e-05, "loss": 1.5931, "step": 4960 }, { "epoch": 1.7, "grad_norm": 1.8101505041122437, "learning_rate": 2.97464016449623e-05, "loss": 1.5629, "step": 4970 }, { "epoch": 1.71, "grad_norm": 0.966204047203064, "learning_rate": 2.9403701165181634e-05, "loss": 1.5048, "step": 4980 }, { "epoch": 1.71, "grad_norm": 1.2718944549560547, "learning_rate": 2.9061000685400963e-05, "loss": 1.6685, "step": 4990 }, { "epoch": 1.71, "grad_norm": 0.9012284874916077, "learning_rate": 2.871830020562029e-05, "loss": 1.5769, "step": 5000 }, { "epoch": 1.71, "eval_loss": 1.9053254127502441, "eval_runtime": 33.1389, "eval_samples_per_second": 30.176, "eval_steps_per_second": 3.772, "step": 5000 }, { "epoch": 1.72, "grad_norm": 1.4876313209533691, "learning_rate": 2.837559972583962e-05, "loss": 1.4846, "step": 5010 }, { "epoch": 1.72, "grad_norm": 0.9953039288520813, "learning_rate": 2.8032899246058946e-05, "loss": 1.6145, "step": 5020 }, { "epoch": 1.72, "grad_norm": 1.4575115442276, "learning_rate": 2.7690198766278275e-05, "loss": 1.5442, "step": 5030 }, { "epoch": 1.73, "grad_norm": 1.3410977125167847, "learning_rate": 2.73474982864976e-05, "loss": 1.5617, "step": 5040 }, { "epoch": 1.73, "grad_norm": 1.5489014387130737, "learning_rate": 2.700479780671693e-05, "loss": 1.6061, "step": 5050 }, { "epoch": 1.73, "grad_norm": 2.2693567276000977, "learning_rate": 2.6662097326936258e-05, "loss": 1.576, "step": 5060 }, { "epoch": 1.74, "grad_norm": 1.776106595993042, "learning_rate": 2.6319396847155587e-05, "loss": 1.57, "step": 5070 }, { "epoch": 1.74, "grad_norm": 1.0588148832321167, "learning_rate": 2.5976696367374914e-05, "loss": 1.476, "step": 5080 }, { "epoch": 1.74, "grad_norm": 1.133484125137329, "learning_rate": 2.5633995887594243e-05, "loss": 1.5103, "step": 5090 }, { "epoch": 1.75, "grad_norm": 1.3961825370788574, "learning_rate": 2.529129540781357e-05, "loss": 1.59, "step": 5100 }, { "epoch": 1.75, "grad_norm": 1.7427486181259155, "learning_rate": 2.49485949280329e-05, "loss": 1.5608, "step": 5110 }, { "epoch": 1.75, "grad_norm": 1.961029291152954, "learning_rate": 2.460589444825223e-05, "loss": 1.6278, "step": 5120 }, { "epoch": 1.76, "grad_norm": 1.2870323657989502, "learning_rate": 2.4263193968471555e-05, "loss": 1.5877, "step": 5130 }, { "epoch": 1.76, "grad_norm": 1.204353928565979, "learning_rate": 2.3920493488690885e-05, "loss": 1.5, "step": 5140 }, { "epoch": 1.76, "grad_norm": 0.9764713644981384, "learning_rate": 2.357779300891021e-05, "loss": 1.6451, "step": 5150 }, { "epoch": 1.77, "grad_norm": 1.2140144109725952, "learning_rate": 2.3235092529129544e-05, "loss": 1.4958, "step": 5160 }, { "epoch": 1.77, "grad_norm": 1.9167425632476807, "learning_rate": 2.289239204934887e-05, "loss": 1.569, "step": 5170 }, { "epoch": 1.77, "grad_norm": 1.864986538887024, "learning_rate": 2.25496915695682e-05, "loss": 1.5232, "step": 5180 }, { "epoch": 1.78, "grad_norm": 1.2421759366989136, "learning_rate": 2.2206991089787527e-05, "loss": 1.5894, "step": 5190 }, { "epoch": 1.78, "grad_norm": 1.412864089012146, "learning_rate": 2.1898560657984922e-05, "loss": 1.5737, "step": 5200 }, { "epoch": 1.78, "grad_norm": 1.105542778968811, "learning_rate": 2.1555860178204252e-05, "loss": 1.5747, "step": 5210 }, { "epoch": 1.79, "grad_norm": 1.7511961460113525, "learning_rate": 2.1213159698423578e-05, "loss": 1.5455, "step": 5220 }, { "epoch": 1.79, "grad_norm": 1.4287422895431519, "learning_rate": 2.0870459218642908e-05, "loss": 1.4871, "step": 5230 }, { "epoch": 1.79, "grad_norm": 0.835995614528656, "learning_rate": 2.0527758738862234e-05, "loss": 1.6128, "step": 5240 }, { "epoch": 1.8, "grad_norm": 1.8323885202407837, "learning_rate": 2.0185058259081564e-05, "loss": 1.6333, "step": 5250 }, { "epoch": 1.8, "grad_norm": 1.5953247547149658, "learning_rate": 1.984235777930089e-05, "loss": 1.5631, "step": 5260 }, { "epoch": 1.8, "grad_norm": 1.4622983932495117, "learning_rate": 1.9499657299520223e-05, "loss": 1.5065, "step": 5270 }, { "epoch": 1.81, "grad_norm": 1.6321667432785034, "learning_rate": 1.915695681973955e-05, "loss": 1.59, "step": 5280 }, { "epoch": 1.81, "grad_norm": 1.3693170547485352, "learning_rate": 1.8814256339958876e-05, "loss": 1.5847, "step": 5290 }, { "epoch": 1.82, "grad_norm": 1.5187667608261108, "learning_rate": 1.8471555860178205e-05, "loss": 1.541, "step": 5300 }, { "epoch": 1.82, "grad_norm": 1.1000255346298218, "learning_rate": 1.8128855380397532e-05, "loss": 1.4775, "step": 5310 }, { "epoch": 1.82, "grad_norm": 1.4071645736694336, "learning_rate": 1.778615490061686e-05, "loss": 1.6336, "step": 5320 }, { "epoch": 1.83, "grad_norm": 1.5703157186508179, "learning_rate": 1.7443454420836188e-05, "loss": 1.6725, "step": 5330 }, { "epoch": 1.83, "grad_norm": 1.0555702447891235, "learning_rate": 1.710075394105552e-05, "loss": 1.4712, "step": 5340 }, { "epoch": 1.83, "grad_norm": 1.4873102903366089, "learning_rate": 1.6758053461274847e-05, "loss": 1.5741, "step": 5350 }, { "epoch": 1.84, "grad_norm": 1.1715468168258667, "learning_rate": 1.6415352981494177e-05, "loss": 1.4884, "step": 5360 }, { "epoch": 1.84, "grad_norm": 1.82741379737854, "learning_rate": 1.6072652501713503e-05, "loss": 1.5778, "step": 5370 }, { "epoch": 1.84, "grad_norm": 1.6479945182800293, "learning_rate": 1.5729952021932833e-05, "loss": 1.6802, "step": 5380 }, { "epoch": 1.85, "grad_norm": 1.0871607065200806, "learning_rate": 1.538725154215216e-05, "loss": 1.5509, "step": 5390 }, { "epoch": 1.85, "grad_norm": 1.7326961755752563, "learning_rate": 1.5044551062371487e-05, "loss": 1.5746, "step": 5400 }, { "epoch": 1.85, "grad_norm": 1.3573588132858276, "learning_rate": 1.4701850582590817e-05, "loss": 1.6147, "step": 5410 }, { "epoch": 1.86, "grad_norm": 1.807897925376892, "learning_rate": 1.4359150102810145e-05, "loss": 1.6446, "step": 5420 }, { "epoch": 1.86, "grad_norm": 1.0243467092514038, "learning_rate": 1.4016449623029473e-05, "loss": 1.5844, "step": 5430 }, { "epoch": 1.86, "grad_norm": 1.709069013595581, "learning_rate": 1.36737491432488e-05, "loss": 1.5774, "step": 5440 }, { "epoch": 1.87, "grad_norm": 1.717564582824707, "learning_rate": 1.3331048663468129e-05, "loss": 1.5898, "step": 5450 }, { "epoch": 1.87, "grad_norm": 1.1066781282424927, "learning_rate": 1.2988348183687457e-05, "loss": 1.5828, "step": 5460 }, { "epoch": 1.87, "grad_norm": 1.119360089302063, "learning_rate": 1.2645647703906785e-05, "loss": 1.5321, "step": 5470 }, { "epoch": 1.88, "grad_norm": 1.0519651174545288, "learning_rate": 1.2302947224126114e-05, "loss": 1.5691, "step": 5480 }, { "epoch": 1.88, "grad_norm": 1.7377208471298218, "learning_rate": 1.1960246744345442e-05, "loss": 1.5568, "step": 5490 }, { "epoch": 1.88, "grad_norm": 1.4080170392990112, "learning_rate": 1.1617546264564772e-05, "loss": 1.6109, "step": 5500 }, { "epoch": 1.88, "eval_loss": 1.9016900062561035, "eval_runtime": 33.1979, "eval_samples_per_second": 30.122, "eval_steps_per_second": 3.765, "step": 5500 } ], "logging_steps": 10, "max_steps": 5838, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 2.980420245786624e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }