{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 29459, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033945483553413217, "grad_norm": 1.450656533241272, "learning_rate": 4.983027258223293e-05, "loss": 1.3353, "step": 100 }, { "epoch": 0.0067890967106826435, "grad_norm": 1.7437301874160767, "learning_rate": 4.966224243864354e-05, "loss": 1.2966, "step": 200 }, { "epoch": 0.010183645066023966, "grad_norm": 1.5103371143341064, "learning_rate": 4.949251502087647e-05, "loss": 1.2919, "step": 300 }, { "epoch": 0.013578193421365287, "grad_norm": 1.2713204622268677, "learning_rate": 4.932278760310941e-05, "loss": 1.417, "step": 400 }, { "epoch": 0.01697274177670661, "grad_norm": 1.9554837942123413, "learning_rate": 4.915306018534234e-05, "loss": 1.3327, "step": 500 }, { "epoch": 0.02036729013204793, "grad_norm": 5.13068962097168, "learning_rate": 4.898333276757528e-05, "loss": 1.3058, "step": 600 }, { "epoch": 0.023761838487389254, "grad_norm": 1.6552410125732422, "learning_rate": 4.881360534980821e-05, "loss": 1.3226, "step": 700 }, { "epoch": 0.027156386842730574, "grad_norm": 1.807737946510315, "learning_rate": 4.864387793204114e-05, "loss": 1.334, "step": 800 }, { "epoch": 0.030550935198071897, "grad_norm": 1.570865273475647, "learning_rate": 4.847415051427408e-05, "loss": 1.3456, "step": 900 }, { "epoch": 0.03394548355341322, "grad_norm": 1.8917468786239624, "learning_rate": 4.830442309650701e-05, "loss": 1.3255, "step": 1000 }, { "epoch": 0.03734003190875454, "grad_norm": 1.649667739868164, "learning_rate": 4.813469567873995e-05, "loss": 1.3013, "step": 1100 }, { "epoch": 0.04073458026409586, "grad_norm": 2.6449484825134277, "learning_rate": 4.796666553515055e-05, "loss": 1.3369, "step": 1200 }, { "epoch": 0.044129128619437186, "grad_norm": 1.874516487121582, "learning_rate": 4.779693811738348e-05, "loss": 1.27, "step": 1300 }, { "epoch": 0.04752367697477851, "grad_norm": 1.7284377813339233, "learning_rate": 4.762721069961642e-05, "loss": 1.3931, "step": 1400 }, { "epoch": 0.050918225330119825, "grad_norm": 3.4324212074279785, "learning_rate": 4.745748328184935e-05, "loss": 1.3457, "step": 1500 }, { "epoch": 0.05431277368546115, "grad_norm": 1.3490582704544067, "learning_rate": 4.7287755864082286e-05, "loss": 1.2487, "step": 1600 }, { "epoch": 0.05770732204080247, "grad_norm": 1.493403673171997, "learning_rate": 4.711802844631522e-05, "loss": 1.227, "step": 1700 }, { "epoch": 0.061101870396143794, "grad_norm": 1.8711298704147339, "learning_rate": 4.694830102854815e-05, "loss": 1.2983, "step": 1800 }, { "epoch": 0.06449641875148511, "grad_norm": 1.6430071592330933, "learning_rate": 4.677857361078109e-05, "loss": 1.3099, "step": 1900 }, { "epoch": 0.06789096710682643, "grad_norm": 1.425639033317566, "learning_rate": 4.660884619301402e-05, "loss": 1.2607, "step": 2000 }, { "epoch": 0.07128551546216776, "grad_norm": 1.7377012968063354, "learning_rate": 4.6439118775246956e-05, "loss": 1.2527, "step": 2100 }, { "epoch": 0.07468006381750908, "grad_norm": 1.5430407524108887, "learning_rate": 4.626939135747989e-05, "loss": 1.2877, "step": 2200 }, { "epoch": 0.0780746121728504, "grad_norm": 1.3293898105621338, "learning_rate": 4.6101361213890495e-05, "loss": 1.1781, "step": 2300 }, { "epoch": 0.08146916052819173, "grad_norm": 2.114269733428955, "learning_rate": 4.593163379612343e-05, "loss": 1.3405, "step": 2400 }, { "epoch": 0.08486370888353305, "grad_norm": 1.370626449584961, "learning_rate": 4.576190637835636e-05, "loss": 1.303, "step": 2500 }, { "epoch": 0.08825825723887437, "grad_norm": 1.6202032566070557, "learning_rate": 4.5592178960589295e-05, "loss": 1.2427, "step": 2600 }, { "epoch": 0.0916528055942157, "grad_norm": 1.6252601146697998, "learning_rate": 4.542245154282223e-05, "loss": 1.2768, "step": 2700 }, { "epoch": 0.09504735394955702, "grad_norm": 1.4289278984069824, "learning_rate": 4.5252724125055165e-05, "loss": 1.2522, "step": 2800 }, { "epoch": 0.09844190230489833, "grad_norm": 1.1954665184020996, "learning_rate": 4.50829967072881e-05, "loss": 1.3008, "step": 2900 }, { "epoch": 0.10183645066023965, "grad_norm": 2.3695414066314697, "learning_rate": 4.491326928952103e-05, "loss": 1.3439, "step": 3000 }, { "epoch": 0.10523099901558097, "grad_norm": 1.5015544891357422, "learning_rate": 4.4743541871753965e-05, "loss": 1.2042, "step": 3100 }, { "epoch": 0.1086255473709223, "grad_norm": 1.3509881496429443, "learning_rate": 4.45738144539869e-05, "loss": 1.2834, "step": 3200 }, { "epoch": 0.11202009572626362, "grad_norm": 1.1645573377609253, "learning_rate": 4.4404087036219835e-05, "loss": 1.3342, "step": 3300 }, { "epoch": 0.11541464408160494, "grad_norm": 1.3376731872558594, "learning_rate": 4.4234359618452764e-05, "loss": 1.2501, "step": 3400 }, { "epoch": 0.11880919243694626, "grad_norm": 1.184652328491211, "learning_rate": 4.40646322006857e-05, "loss": 1.2067, "step": 3500 }, { "epoch": 0.12220374079228759, "grad_norm": 1.5623388290405273, "learning_rate": 4.3894904782918635e-05, "loss": 1.2109, "step": 3600 }, { "epoch": 0.1255982891476289, "grad_norm": 1.537017583847046, "learning_rate": 4.372517736515157e-05, "loss": 1.2207, "step": 3700 }, { "epoch": 0.12899283750297022, "grad_norm": 1.5127204656600952, "learning_rate": 4.3555449947384505e-05, "loss": 1.3273, "step": 3800 }, { "epoch": 0.13238738585831156, "grad_norm": 1.504813313484192, "learning_rate": 4.3385722529617434e-05, "loss": 1.2347, "step": 3900 }, { "epoch": 0.13578193421365287, "grad_norm": 1.5462582111358643, "learning_rate": 4.321599511185037e-05, "loss": 1.3142, "step": 4000 }, { "epoch": 0.1391764825689942, "grad_norm": 1.377742886543274, "learning_rate": 4.3046267694083305e-05, "loss": 1.2685, "step": 4100 }, { "epoch": 0.1425710309243355, "grad_norm": 1.5139976739883423, "learning_rate": 4.287654027631624e-05, "loss": 1.2718, "step": 4200 }, { "epoch": 0.14596557927967685, "grad_norm": 1.2067663669586182, "learning_rate": 4.2706812858549175e-05, "loss": 1.2405, "step": 4300 }, { "epoch": 0.14936012763501816, "grad_norm": 1.2739530801773071, "learning_rate": 4.2537085440782104e-05, "loss": 1.2186, "step": 4400 }, { "epoch": 0.1527546759903595, "grad_norm": 1.4101356267929077, "learning_rate": 4.236735802301504e-05, "loss": 1.222, "step": 4500 }, { "epoch": 0.1561492243457008, "grad_norm": 1.8474892377853394, "learning_rate": 4.2197630605247975e-05, "loss": 1.2685, "step": 4600 }, { "epoch": 0.1595437727010421, "grad_norm": 1.5274946689605713, "learning_rate": 4.202790318748091e-05, "loss": 1.2161, "step": 4700 }, { "epoch": 0.16293832105638345, "grad_norm": 1.8831485509872437, "learning_rate": 4.1859873043891514e-05, "loss": 1.2828, "step": 4800 }, { "epoch": 0.16633286941172476, "grad_norm": 2.1567959785461426, "learning_rate": 4.169014562612444e-05, "loss": 1.3027, "step": 4900 }, { "epoch": 0.1697274177670661, "grad_norm": 1.4506981372833252, "learning_rate": 4.152041820835738e-05, "loss": 1.2094, "step": 5000 }, { "epoch": 0.1731219661224074, "grad_norm": 1.2342296838760376, "learning_rate": 4.1350690790590314e-05, "loss": 1.2523, "step": 5100 }, { "epoch": 0.17651651447774874, "grad_norm": 1.6375709772109985, "learning_rate": 4.118096337282325e-05, "loss": 1.2418, "step": 5200 }, { "epoch": 0.17991106283309005, "grad_norm": 1.3406407833099365, "learning_rate": 4.1011235955056184e-05, "loss": 1.2777, "step": 5300 }, { "epoch": 0.1833056111884314, "grad_norm": 1.2170027494430542, "learning_rate": 4.084150853728911e-05, "loss": 1.1458, "step": 5400 }, { "epoch": 0.1867001595437727, "grad_norm": 1.4051603078842163, "learning_rate": 4.067178111952205e-05, "loss": 1.2022, "step": 5500 }, { "epoch": 0.19009470789911403, "grad_norm": 1.3835875988006592, "learning_rate": 4.0502053701754984e-05, "loss": 1.2601, "step": 5600 }, { "epoch": 0.19348925625445534, "grad_norm": 1.7600008249282837, "learning_rate": 4.033232628398791e-05, "loss": 1.2172, "step": 5700 }, { "epoch": 0.19688380460979665, "grad_norm": 1.4803307056427002, "learning_rate": 4.0162598866220854e-05, "loss": 1.3049, "step": 5800 }, { "epoch": 0.200278352965138, "grad_norm": 1.2911592721939087, "learning_rate": 3.999287144845378e-05, "loss": 1.2018, "step": 5900 }, { "epoch": 0.2036729013204793, "grad_norm": 1.5094434022903442, "learning_rate": 3.982314403068672e-05, "loss": 1.1798, "step": 6000 }, { "epoch": 0.20706744967582064, "grad_norm": 1.44119131565094, "learning_rate": 3.9653416612919654e-05, "loss": 1.2012, "step": 6100 }, { "epoch": 0.21046199803116195, "grad_norm": 1.3899762630462646, "learning_rate": 3.948368919515258e-05, "loss": 1.1851, "step": 6200 }, { "epoch": 0.21385654638650328, "grad_norm": 2.047968864440918, "learning_rate": 3.9313961777385524e-05, "loss": 1.2239, "step": 6300 }, { "epoch": 0.2172510947418446, "grad_norm": 1.827493667602539, "learning_rate": 3.914423435961845e-05, "loss": 1.2358, "step": 6400 }, { "epoch": 0.22064564309718593, "grad_norm": 1.4631316661834717, "learning_rate": 3.8976204216029064e-05, "loss": 1.2058, "step": 6500 }, { "epoch": 0.22404019145252724, "grad_norm": 1.5218262672424316, "learning_rate": 3.880647679826199e-05, "loss": 1.1717, "step": 6600 }, { "epoch": 0.22743473980786857, "grad_norm": 1.3896803855895996, "learning_rate": 3.863674938049492e-05, "loss": 1.1912, "step": 6700 }, { "epoch": 0.23082928816320988, "grad_norm": 1.587547779083252, "learning_rate": 3.846702196272786e-05, "loss": 1.2109, "step": 6800 }, { "epoch": 0.23422383651855122, "grad_norm": 1.35820472240448, "learning_rate": 3.829729454496079e-05, "loss": 1.1828, "step": 6900 }, { "epoch": 0.23761838487389253, "grad_norm": 1.2581636905670166, "learning_rate": 3.8127567127193734e-05, "loss": 1.1701, "step": 7000 }, { "epoch": 0.24101293322923384, "grad_norm": 1.617680549621582, "learning_rate": 3.795783970942666e-05, "loss": 1.2159, "step": 7100 }, { "epoch": 0.24440748158457518, "grad_norm": 1.3621796369552612, "learning_rate": 3.778811229165959e-05, "loss": 1.1951, "step": 7200 }, { "epoch": 0.24780202993991648, "grad_norm": 1.8783783912658691, "learning_rate": 3.761838487389253e-05, "loss": 1.2664, "step": 7300 }, { "epoch": 0.2511965782952578, "grad_norm": 1.1315891742706299, "learning_rate": 3.744865745612546e-05, "loss": 1.206, "step": 7400 }, { "epoch": 0.25459112665059913, "grad_norm": 1.3531254529953003, "learning_rate": 3.7278930038358404e-05, "loss": 1.152, "step": 7500 }, { "epoch": 0.25798567500594044, "grad_norm": 1.7136415243148804, "learning_rate": 3.710920262059133e-05, "loss": 1.3282, "step": 7600 }, { "epoch": 0.2613802233612818, "grad_norm": 1.5798516273498535, "learning_rate": 3.693947520282426e-05, "loss": 1.2192, "step": 7700 }, { "epoch": 0.2647747717166231, "grad_norm": 2.0638535022735596, "learning_rate": 3.67697477850572e-05, "loss": 1.2233, "step": 7800 }, { "epoch": 0.2681693200719644, "grad_norm": 1.6473902463912964, "learning_rate": 3.660002036729013e-05, "loss": 1.2384, "step": 7900 }, { "epoch": 0.27156386842730573, "grad_norm": 1.4174180030822754, "learning_rate": 3.643029294952307e-05, "loss": 1.3018, "step": 8000 }, { "epoch": 0.27495841678264704, "grad_norm": 1.470323920249939, "learning_rate": 3.6260565531756e-05, "loss": 1.1662, "step": 8100 }, { "epoch": 0.2783529651379884, "grad_norm": 1.1814874410629272, "learning_rate": 3.609083811398893e-05, "loss": 1.1641, "step": 8200 }, { "epoch": 0.2817475134933297, "grad_norm": 1.496795892715454, "learning_rate": 3.592111069622187e-05, "loss": 1.208, "step": 8300 }, { "epoch": 0.285142061848671, "grad_norm": 1.602959394454956, "learning_rate": 3.57513832784548e-05, "loss": 1.2323, "step": 8400 }, { "epoch": 0.28853661020401233, "grad_norm": 1.4096314907073975, "learning_rate": 3.558165586068774e-05, "loss": 1.1804, "step": 8500 }, { "epoch": 0.2919311585593537, "grad_norm": 1.2292312383651733, "learning_rate": 3.541192844292067e-05, "loss": 1.1747, "step": 8600 }, { "epoch": 0.295325706914695, "grad_norm": 1.3961174488067627, "learning_rate": 3.52422010251536e-05, "loss": 1.1459, "step": 8700 }, { "epoch": 0.2987202552700363, "grad_norm": 1.2199640274047852, "learning_rate": 3.507247360738654e-05, "loss": 1.1992, "step": 8800 }, { "epoch": 0.3021148036253776, "grad_norm": 1.316805362701416, "learning_rate": 3.490274618961947e-05, "loss": 1.2202, "step": 8900 }, { "epoch": 0.305509351980719, "grad_norm": 1.3120840787887573, "learning_rate": 3.473301877185241e-05, "loss": 1.1095, "step": 9000 }, { "epoch": 0.3089039003360603, "grad_norm": 1.14743971824646, "learning_rate": 3.456329135408534e-05, "loss": 1.2154, "step": 9100 }, { "epoch": 0.3122984486914016, "grad_norm": 1.6754459142684937, "learning_rate": 3.439356393631827e-05, "loss": 1.1927, "step": 9200 }, { "epoch": 0.3156929970467429, "grad_norm": 1.2429569959640503, "learning_rate": 3.422383651855121e-05, "loss": 1.2623, "step": 9300 }, { "epoch": 0.3190875454020842, "grad_norm": 1.5485316514968872, "learning_rate": 3.405410910078414e-05, "loss": 1.176, "step": 9400 }, { "epoch": 0.3224820937574256, "grad_norm": 1.3292936086654663, "learning_rate": 3.388438168301708e-05, "loss": 1.1637, "step": 9500 }, { "epoch": 0.3258766421127669, "grad_norm": 1.4114725589752197, "learning_rate": 3.371465426525001e-05, "loss": 1.151, "step": 9600 }, { "epoch": 0.3292711904681082, "grad_norm": 1.6183195114135742, "learning_rate": 3.354492684748294e-05, "loss": 1.1929, "step": 9700 }, { "epoch": 0.3326657388234495, "grad_norm": 1.7640340328216553, "learning_rate": 3.3375199429715876e-05, "loss": 1.1472, "step": 9800 }, { "epoch": 0.3360602871787909, "grad_norm": 1.300631046295166, "learning_rate": 3.320547201194881e-05, "loss": 1.2158, "step": 9900 }, { "epoch": 0.3394548355341322, "grad_norm": 1.5510449409484863, "learning_rate": 3.303574459418175e-05, "loss": 1.231, "step": 10000 }, { "epoch": 0.3428493838894735, "grad_norm": 1.7029348611831665, "learning_rate": 3.286601717641468e-05, "loss": 1.152, "step": 10100 }, { "epoch": 0.3462439322448148, "grad_norm": 2.094801902770996, "learning_rate": 3.269628975864761e-05, "loss": 1.1792, "step": 10200 }, { "epoch": 0.3496384806001562, "grad_norm": 1.2476887702941895, "learning_rate": 3.2526562340880546e-05, "loss": 1.1297, "step": 10300 }, { "epoch": 0.3530330289554975, "grad_norm": 1.2222412824630737, "learning_rate": 3.235683492311348e-05, "loss": 1.2194, "step": 10400 }, { "epoch": 0.3564275773108388, "grad_norm": 1.2689149379730225, "learning_rate": 3.218710750534642e-05, "loss": 1.109, "step": 10500 }, { "epoch": 0.3598221256661801, "grad_norm": 1.1400436162948608, "learning_rate": 3.201738008757935e-05, "loss": 1.1778, "step": 10600 }, { "epoch": 0.3632166740215214, "grad_norm": 1.5304007530212402, "learning_rate": 3.184765266981228e-05, "loss": 1.2015, "step": 10700 }, { "epoch": 0.3666112223768628, "grad_norm": 1.4382191896438599, "learning_rate": 3.1677925252045216e-05, "loss": 1.2023, "step": 10800 }, { "epoch": 0.3700057707322041, "grad_norm": 1.2539787292480469, "learning_rate": 3.150819783427815e-05, "loss": 1.1627, "step": 10900 }, { "epoch": 0.3734003190875454, "grad_norm": 1.6526975631713867, "learning_rate": 3.133847041651109e-05, "loss": 1.1909, "step": 11000 }, { "epoch": 0.3767948674428867, "grad_norm": 1.477150559425354, "learning_rate": 3.1168742998744016e-05, "loss": 1.1767, "step": 11100 }, { "epoch": 0.38018941579822807, "grad_norm": 1.655372142791748, "learning_rate": 3.100071285515462e-05, "loss": 1.1715, "step": 11200 }, { "epoch": 0.3835839641535694, "grad_norm": 1.237518310546875, "learning_rate": 3.0830985437387555e-05, "loss": 1.1148, "step": 11300 }, { "epoch": 0.3869785125089107, "grad_norm": 2.0262339115142822, "learning_rate": 3.066125801962049e-05, "loss": 1.056, "step": 11400 }, { "epoch": 0.390373060864252, "grad_norm": 1.4669376611709595, "learning_rate": 3.0491530601853423e-05, "loss": 1.1773, "step": 11500 }, { "epoch": 0.3937676092195933, "grad_norm": 1.6047866344451904, "learning_rate": 3.032180318408636e-05, "loss": 1.1846, "step": 11600 }, { "epoch": 0.39716215757493467, "grad_norm": 1.5415077209472656, "learning_rate": 3.0152075766319293e-05, "loss": 1.1481, "step": 11700 }, { "epoch": 0.400556705930276, "grad_norm": 1.2356903553009033, "learning_rate": 2.9982348348552225e-05, "loss": 1.1914, "step": 11800 }, { "epoch": 0.4039512542856173, "grad_norm": 1.691815733909607, "learning_rate": 2.9814318204962833e-05, "loss": 1.2595, "step": 11900 }, { "epoch": 0.4073458026409586, "grad_norm": 1.3964107036590576, "learning_rate": 2.9644590787195765e-05, "loss": 1.137, "step": 12000 }, { "epoch": 0.41074035099629996, "grad_norm": 1.4641882181167603, "learning_rate": 2.94748633694287e-05, "loss": 1.2194, "step": 12100 }, { "epoch": 0.4141348993516413, "grad_norm": 1.2686254978179932, "learning_rate": 2.9305135951661632e-05, "loss": 1.1666, "step": 12200 }, { "epoch": 0.4175294477069826, "grad_norm": 1.5064525604248047, "learning_rate": 2.9135408533894564e-05, "loss": 1.2265, "step": 12300 }, { "epoch": 0.4209239960623239, "grad_norm": 1.3071587085723877, "learning_rate": 2.8965681116127503e-05, "loss": 1.0625, "step": 12400 }, { "epoch": 0.42431854441766526, "grad_norm": 1.4859912395477295, "learning_rate": 2.8795953698360435e-05, "loss": 1.1239, "step": 12500 }, { "epoch": 0.42771309277300656, "grad_norm": 1.4131548404693604, "learning_rate": 2.862622628059337e-05, "loss": 1.2125, "step": 12600 }, { "epoch": 0.4311076411283479, "grad_norm": 1.1708953380584717, "learning_rate": 2.8456498862826302e-05, "loss": 1.145, "step": 12700 }, { "epoch": 0.4345021894836892, "grad_norm": 1.4931575059890747, "learning_rate": 2.8286771445059234e-05, "loss": 1.102, "step": 12800 }, { "epoch": 0.4378967378390305, "grad_norm": 1.6308887004852295, "learning_rate": 2.8117044027292173e-05, "loss": 1.1574, "step": 12900 }, { "epoch": 0.44129128619437186, "grad_norm": 1.532914638519287, "learning_rate": 2.7947316609525105e-05, "loss": 1.1901, "step": 13000 }, { "epoch": 0.44468583454971317, "grad_norm": 1.5746792554855347, "learning_rate": 2.7777589191758037e-05, "loss": 1.2077, "step": 13100 }, { "epoch": 0.4480803829050545, "grad_norm": 1.7640366554260254, "learning_rate": 2.7607861773990972e-05, "loss": 1.2147, "step": 13200 }, { "epoch": 0.4514749312603958, "grad_norm": 1.4942810535430908, "learning_rate": 2.7438134356223904e-05, "loss": 1.2471, "step": 13300 }, { "epoch": 0.45486947961573715, "grad_norm": 1.449723243713379, "learning_rate": 2.7268406938456843e-05, "loss": 1.1991, "step": 13400 }, { "epoch": 0.45826402797107846, "grad_norm": 1.0219964981079102, "learning_rate": 2.7098679520689775e-05, "loss": 1.0989, "step": 13500 }, { "epoch": 0.46165857632641977, "grad_norm": 1.4733655452728271, "learning_rate": 2.6928952102922707e-05, "loss": 1.1652, "step": 13600 }, { "epoch": 0.4650531246817611, "grad_norm": 1.4748992919921875, "learning_rate": 2.6759224685155642e-05, "loss": 1.1522, "step": 13700 }, { "epoch": 0.46844767303710244, "grad_norm": 1.918239712715149, "learning_rate": 2.6589497267388574e-05, "loss": 1.0624, "step": 13800 }, { "epoch": 0.47184222139244375, "grad_norm": 1.4620022773742676, "learning_rate": 2.6419769849621513e-05, "loss": 1.2269, "step": 13900 }, { "epoch": 0.47523676974778506, "grad_norm": 1.647291898727417, "learning_rate": 2.6250042431854445e-05, "loss": 1.0928, "step": 14000 }, { "epoch": 0.47863131810312637, "grad_norm": 1.4002645015716553, "learning_rate": 2.6080315014087377e-05, "loss": 1.1475, "step": 14100 }, { "epoch": 0.4820258664584677, "grad_norm": 1.329160451889038, "learning_rate": 2.5910587596320312e-05, "loss": 1.1787, "step": 14200 }, { "epoch": 0.48542041481380904, "grad_norm": 1.0468798875808716, "learning_rate": 2.5740860178553244e-05, "loss": 1.1257, "step": 14300 }, { "epoch": 0.48881496316915035, "grad_norm": 1.1814810037612915, "learning_rate": 2.5571132760786176e-05, "loss": 1.2252, "step": 14400 }, { "epoch": 0.49220951152449166, "grad_norm": 1.442358136177063, "learning_rate": 2.5401405343019115e-05, "loss": 1.1474, "step": 14500 }, { "epoch": 0.49560405987983297, "grad_norm": 1.2082366943359375, "learning_rate": 2.5231677925252044e-05, "loss": 1.1271, "step": 14600 }, { "epoch": 0.49899860823517433, "grad_norm": 1.3044782876968384, "learning_rate": 2.5061950507484982e-05, "loss": 1.1204, "step": 14700 }, { "epoch": 0.5023931565905156, "grad_norm": 1.257338047027588, "learning_rate": 2.4893920363895583e-05, "loss": 1.1891, "step": 14800 }, { "epoch": 0.505787704945857, "grad_norm": 1.6963568925857544, "learning_rate": 2.472419294612852e-05, "loss": 1.0711, "step": 14900 }, { "epoch": 0.5091822533011983, "grad_norm": 1.4593158960342407, "learning_rate": 2.4554465528361454e-05, "loss": 1.1764, "step": 15000 }, { "epoch": 0.5125768016565396, "grad_norm": 1.2803332805633545, "learning_rate": 2.438473811059439e-05, "loss": 1.1213, "step": 15100 }, { "epoch": 0.5159713500118809, "grad_norm": 1.0880329608917236, "learning_rate": 2.421501069282732e-05, "loss": 1.0686, "step": 15200 }, { "epoch": 0.5193658983672222, "grad_norm": 1.350434422492981, "learning_rate": 2.4045283275060253e-05, "loss": 1.1244, "step": 15300 }, { "epoch": 0.5227604467225636, "grad_norm": 1.4851505756378174, "learning_rate": 2.387555585729319e-05, "loss": 1.1519, "step": 15400 }, { "epoch": 0.5261549950779049, "grad_norm": 1.4524593353271484, "learning_rate": 2.3705828439526124e-05, "loss": 1.1139, "step": 15500 }, { "epoch": 0.5295495434332462, "grad_norm": 1.3715015649795532, "learning_rate": 2.3536101021759056e-05, "loss": 1.1176, "step": 15600 }, { "epoch": 0.5329440917885875, "grad_norm": 1.3227180242538452, "learning_rate": 2.3366373603991988e-05, "loss": 1.1547, "step": 15700 }, { "epoch": 0.5363386401439288, "grad_norm": 1.742480754852295, "learning_rate": 2.3196646186224923e-05, "loss": 1.2338, "step": 15800 }, { "epoch": 0.5397331884992702, "grad_norm": 1.3990530967712402, "learning_rate": 2.302691876845786e-05, "loss": 1.1808, "step": 15900 }, { "epoch": 0.5431277368546115, "grad_norm": 1.6087653636932373, "learning_rate": 2.285719135069079e-05, "loss": 1.2029, "step": 16000 }, { "epoch": 0.5465222852099528, "grad_norm": 1.3504618406295776, "learning_rate": 2.2687463932923726e-05, "loss": 1.138, "step": 16100 }, { "epoch": 0.5499168335652941, "grad_norm": 1.226248025894165, "learning_rate": 2.2517736515156658e-05, "loss": 1.1006, "step": 16200 }, { "epoch": 0.5533113819206354, "grad_norm": 1.0794544219970703, "learning_rate": 2.2348009097389593e-05, "loss": 1.111, "step": 16300 }, { "epoch": 0.5567059302759768, "grad_norm": 1.3800761699676514, "learning_rate": 2.217828167962253e-05, "loss": 1.1554, "step": 16400 }, { "epoch": 0.5601004786313181, "grad_norm": 1.1783385276794434, "learning_rate": 2.200855426185546e-05, "loss": 1.157, "step": 16500 }, { "epoch": 0.5634950269866594, "grad_norm": 1.483588457107544, "learning_rate": 2.1838826844088396e-05, "loss": 1.1443, "step": 16600 }, { "epoch": 0.5668895753420008, "grad_norm": 1.847670555114746, "learning_rate": 2.1669099426321328e-05, "loss": 1.1667, "step": 16700 }, { "epoch": 0.570284123697342, "grad_norm": 1.524003028869629, "learning_rate": 2.1499372008554263e-05, "loss": 1.1555, "step": 16800 }, { "epoch": 0.5736786720526834, "grad_norm": 1.6308820247650146, "learning_rate": 2.1329644590787195e-05, "loss": 1.0674, "step": 16900 }, { "epoch": 0.5770732204080247, "grad_norm": 1.4396891593933105, "learning_rate": 2.115991717302013e-05, "loss": 1.1481, "step": 17000 }, { "epoch": 0.580467768763366, "grad_norm": 1.6904021501541138, "learning_rate": 2.0990189755253066e-05, "loss": 1.044, "step": 17100 }, { "epoch": 0.5838623171187074, "grad_norm": 1.8386590480804443, "learning_rate": 2.0820462337485998e-05, "loss": 1.0662, "step": 17200 }, { "epoch": 0.5872568654740487, "grad_norm": 1.3602131605148315, "learning_rate": 2.0650734919718933e-05, "loss": 1.072, "step": 17300 }, { "epoch": 0.59065141382939, "grad_norm": 1.2853094339370728, "learning_rate": 2.0481007501951865e-05, "loss": 1.1799, "step": 17400 }, { "epoch": 0.5940459621847313, "grad_norm": 1.418142557144165, "learning_rate": 2.03112800841848e-05, "loss": 1.1163, "step": 17500 }, { "epoch": 0.5974405105400726, "grad_norm": 1.3810557126998901, "learning_rate": 2.0141552666417736e-05, "loss": 1.1246, "step": 17600 }, { "epoch": 0.600835058895414, "grad_norm": 1.3166576623916626, "learning_rate": 1.9971825248650668e-05, "loss": 1.0635, "step": 17700 }, { "epoch": 0.6042296072507553, "grad_norm": 1.2918510437011719, "learning_rate": 1.98020978308836e-05, "loss": 1.1338, "step": 17800 }, { "epoch": 0.6076241556060966, "grad_norm": 1.3206653594970703, "learning_rate": 1.9632370413116535e-05, "loss": 1.1538, "step": 17900 }, { "epoch": 0.611018703961438, "grad_norm": 1.1084457635879517, "learning_rate": 1.946264299534947e-05, "loss": 1.0151, "step": 18000 }, { "epoch": 0.6144132523167792, "grad_norm": 1.6946609020233154, "learning_rate": 1.9292915577582406e-05, "loss": 1.173, "step": 18100 }, { "epoch": 0.6178078006721206, "grad_norm": 1.5061676502227783, "learning_rate": 1.9123188159815334e-05, "loss": 1.1463, "step": 18200 }, { "epoch": 0.6212023490274619, "grad_norm": 1.400976300239563, "learning_rate": 1.895346074204827e-05, "loss": 1.147, "step": 18300 }, { "epoch": 0.6245968973828032, "grad_norm": 1.398390769958496, "learning_rate": 1.8783733324281205e-05, "loss": 1.0824, "step": 18400 }, { "epoch": 0.6279914457381446, "grad_norm": 1.0492353439331055, "learning_rate": 1.861570318069181e-05, "loss": 1.1912, "step": 18500 }, { "epoch": 0.6313859940934858, "grad_norm": 1.5323091745376587, "learning_rate": 1.8445975762924745e-05, "loss": 1.1324, "step": 18600 }, { "epoch": 0.6347805424488272, "grad_norm": 1.2045379877090454, "learning_rate": 1.827624834515768e-05, "loss": 1.1011, "step": 18700 }, { "epoch": 0.6381750908041685, "grad_norm": 1.4627662897109985, "learning_rate": 1.8106520927390612e-05, "loss": 1.0789, "step": 18800 }, { "epoch": 0.6415696391595098, "grad_norm": 1.492099642753601, "learning_rate": 1.7936793509623544e-05, "loss": 1.1794, "step": 18900 }, { "epoch": 0.6449641875148512, "grad_norm": 1.8160879611968994, "learning_rate": 1.776706609185648e-05, "loss": 1.087, "step": 19000 }, { "epoch": 0.6483587358701924, "grad_norm": 1.135730504989624, "learning_rate": 1.7597338674089415e-05, "loss": 1.1466, "step": 19100 }, { "epoch": 0.6517532842255338, "grad_norm": 1.2633298635482788, "learning_rate": 1.7427611256322347e-05, "loss": 1.1453, "step": 19200 }, { "epoch": 0.6551478325808752, "grad_norm": 1.3639088869094849, "learning_rate": 1.725788383855528e-05, "loss": 1.1772, "step": 19300 }, { "epoch": 0.6585423809362164, "grad_norm": 1.5371415615081787, "learning_rate": 1.7088156420788214e-05, "loss": 1.1676, "step": 19400 }, { "epoch": 0.6619369292915578, "grad_norm": 1.1205295324325562, "learning_rate": 1.691842900302115e-05, "loss": 1.1312, "step": 19500 }, { "epoch": 0.665331477646899, "grad_norm": 1.3705852031707764, "learning_rate": 1.6748701585254085e-05, "loss": 1.2032, "step": 19600 }, { "epoch": 0.6687260260022404, "grad_norm": 1.6704633235931396, "learning_rate": 1.6578974167487017e-05, "loss": 1.1594, "step": 19700 }, { "epoch": 0.6721205743575818, "grad_norm": 1.3317358493804932, "learning_rate": 1.640924674971995e-05, "loss": 1.1118, "step": 19800 }, { "epoch": 0.675515122712923, "grad_norm": 1.666467547416687, "learning_rate": 1.6239519331952884e-05, "loss": 1.1402, "step": 19900 }, { "epoch": 0.6789096710682644, "grad_norm": 1.5140140056610107, "learning_rate": 1.606979191418582e-05, "loss": 1.0712, "step": 20000 }, { "epoch": 0.6823042194236056, "grad_norm": 1.5290478467941284, "learning_rate": 1.590006449641875e-05, "loss": 1.1054, "step": 20100 }, { "epoch": 0.685698767778947, "grad_norm": 1.408411979675293, "learning_rate": 1.5730337078651687e-05, "loss": 1.1755, "step": 20200 }, { "epoch": 0.6890933161342884, "grad_norm": 1.8979178667068481, "learning_rate": 1.556060966088462e-05, "loss": 1.0911, "step": 20300 }, { "epoch": 0.6924878644896296, "grad_norm": 1.3804025650024414, "learning_rate": 1.5390882243117554e-05, "loss": 1.1299, "step": 20400 }, { "epoch": 0.695882412844971, "grad_norm": 1.1603401899337769, "learning_rate": 1.5221154825350486e-05, "loss": 1.081, "step": 20500 }, { "epoch": 0.6992769612003124, "grad_norm": 1.4648966789245605, "learning_rate": 1.5051427407583421e-05, "loss": 1.1228, "step": 20600 }, { "epoch": 0.7026715095556536, "grad_norm": 1.589272379875183, "learning_rate": 1.4881699989816355e-05, "loss": 1.1068, "step": 20700 }, { "epoch": 0.706066057910995, "grad_norm": 1.337220311164856, "learning_rate": 1.471197257204929e-05, "loss": 1.1538, "step": 20800 }, { "epoch": 0.7094606062663362, "grad_norm": 1.5323350429534912, "learning_rate": 1.4542245154282224e-05, "loss": 1.0592, "step": 20900 }, { "epoch": 0.7128551546216776, "grad_norm": 1.6231937408447266, "learning_rate": 1.4372517736515156e-05, "loss": 1.1526, "step": 21000 }, { "epoch": 0.716249702977019, "grad_norm": 1.8754550218582153, "learning_rate": 1.4202790318748091e-05, "loss": 1.0773, "step": 21100 }, { "epoch": 0.7196442513323602, "grad_norm": 1.1128793954849243, "learning_rate": 1.4033062900981025e-05, "loss": 1.0372, "step": 21200 }, { "epoch": 0.7230387996877016, "grad_norm": 1.5695431232452393, "learning_rate": 1.386333548321396e-05, "loss": 1.1535, "step": 21300 }, { "epoch": 0.7264333480430428, "grad_norm": 1.327945351600647, "learning_rate": 1.3693608065446892e-05, "loss": 1.1107, "step": 21400 }, { "epoch": 0.7298278963983842, "grad_norm": 1.3291347026824951, "learning_rate": 1.3523880647679826e-05, "loss": 1.1303, "step": 21500 }, { "epoch": 0.7332224447537256, "grad_norm": 1.3102412223815918, "learning_rate": 1.3354153229912761e-05, "loss": 1.1271, "step": 21600 }, { "epoch": 0.7366169931090668, "grad_norm": 1.532332181930542, "learning_rate": 1.3184425812145695e-05, "loss": 1.1259, "step": 21700 }, { "epoch": 0.7400115414644082, "grad_norm": 1.8076393604278564, "learning_rate": 1.301469839437863e-05, "loss": 1.0277, "step": 21800 }, { "epoch": 0.7434060898197494, "grad_norm": 1.560998558998108, "learning_rate": 1.2844970976611562e-05, "loss": 1.0944, "step": 21900 }, { "epoch": 0.7468006381750908, "grad_norm": 1.4512039422988892, "learning_rate": 1.2675243558844496e-05, "loss": 1.1439, "step": 22000 }, { "epoch": 0.7501951865304322, "grad_norm": 1.1790564060211182, "learning_rate": 1.2505516141077431e-05, "loss": 1.1109, "step": 22100 }, { "epoch": 0.7535897348857734, "grad_norm": 0.8725073337554932, "learning_rate": 1.2337485997488036e-05, "loss": 1.1064, "step": 22200 }, { "epoch": 0.7569842832411148, "grad_norm": 1.7705230712890625, "learning_rate": 1.2167758579720968e-05, "loss": 1.1094, "step": 22300 }, { "epoch": 0.7603788315964561, "grad_norm": 1.72670578956604, "learning_rate": 1.1998031161953903e-05, "loss": 1.0784, "step": 22400 }, { "epoch": 0.7637733799517974, "grad_norm": 1.0623925924301147, "learning_rate": 1.1828303744186837e-05, "loss": 1.1441, "step": 22500 }, { "epoch": 0.7671679283071388, "grad_norm": 1.4572324752807617, "learning_rate": 1.165857632641977e-05, "loss": 1.0754, "step": 22600 }, { "epoch": 0.77056247666248, "grad_norm": 1.4778876304626465, "learning_rate": 1.1488848908652704e-05, "loss": 1.0816, "step": 22700 }, { "epoch": 0.7739570250178214, "grad_norm": 1.5544917583465576, "learning_rate": 1.1319121490885638e-05, "loss": 1.1449, "step": 22800 }, { "epoch": 0.7773515733731627, "grad_norm": 1.4993566274642944, "learning_rate": 1.1149394073118571e-05, "loss": 1.0315, "step": 22900 }, { "epoch": 0.780746121728504, "grad_norm": 1.5602749586105347, "learning_rate": 1.0979666655351507e-05, "loss": 1.151, "step": 23000 }, { "epoch": 0.7841406700838454, "grad_norm": 1.2788993120193481, "learning_rate": 1.080993923758444e-05, "loss": 1.1134, "step": 23100 }, { "epoch": 0.7875352184391866, "grad_norm": 1.4655214548110962, "learning_rate": 1.0640211819817374e-05, "loss": 1.1033, "step": 23200 }, { "epoch": 0.790929766794528, "grad_norm": 1.4986985921859741, "learning_rate": 1.0470484402050308e-05, "loss": 1.1025, "step": 23300 }, { "epoch": 0.7943243151498693, "grad_norm": 1.651713490486145, "learning_rate": 1.0300756984283241e-05, "loss": 1.173, "step": 23400 }, { "epoch": 0.7977188635052106, "grad_norm": 1.4241468906402588, "learning_rate": 1.0131029566516175e-05, "loss": 1.1499, "step": 23500 }, { "epoch": 0.801113411860552, "grad_norm": 1.4987541437149048, "learning_rate": 9.961302148749109e-06, "loss": 1.0355, "step": 23600 }, { "epoch": 0.8045079602158933, "grad_norm": 1.6847175359725952, "learning_rate": 9.791574730982044e-06, "loss": 1.0974, "step": 23700 }, { "epoch": 0.8079025085712346, "grad_norm": 1.319767713546753, "learning_rate": 9.621847313214976e-06, "loss": 1.0246, "step": 23800 }, { "epoch": 0.8112970569265759, "grad_norm": 0.8837277293205261, "learning_rate": 9.452119895447911e-06, "loss": 1.0828, "step": 23900 }, { "epoch": 0.8146916052819172, "grad_norm": 1.3911470174789429, "learning_rate": 9.282392477680845e-06, "loss": 1.1347, "step": 24000 }, { "epoch": 0.8180861536372586, "grad_norm": 1.6268776655197144, "learning_rate": 9.112665059913779e-06, "loss": 1.1426, "step": 24100 }, { "epoch": 0.8214807019925999, "grad_norm": 1.229019284248352, "learning_rate": 8.942937642146714e-06, "loss": 1.147, "step": 24200 }, { "epoch": 0.8248752503479412, "grad_norm": 1.4097239971160889, "learning_rate": 8.773210224379646e-06, "loss": 1.1377, "step": 24300 }, { "epoch": 0.8282697987032825, "grad_norm": 1.1406160593032837, "learning_rate": 8.603482806612581e-06, "loss": 1.1035, "step": 24400 }, { "epoch": 0.8316643470586238, "grad_norm": 1.0381433963775635, "learning_rate": 8.433755388845515e-06, "loss": 1.091, "step": 24500 }, { "epoch": 0.8350588954139652, "grad_norm": 1.3789398670196533, "learning_rate": 8.264027971078449e-06, "loss": 1.0108, "step": 24600 }, { "epoch": 0.8384534437693065, "grad_norm": 1.2343610525131226, "learning_rate": 8.094300553311382e-06, "loss": 1.1196, "step": 24700 }, { "epoch": 0.8418479921246478, "grad_norm": 1.3978173732757568, "learning_rate": 7.924573135544316e-06, "loss": 1.0261, "step": 24800 }, { "epoch": 0.8452425404799891, "grad_norm": 1.2678471803665161, "learning_rate": 7.75484571777725e-06, "loss": 1.0813, "step": 24900 }, { "epoch": 0.8486370888353305, "grad_norm": 1.5607575178146362, "learning_rate": 7.585118300010184e-06, "loss": 1.167, "step": 25000 }, { "epoch": 0.8520316371906718, "grad_norm": 1.1577645540237427, "learning_rate": 7.415390882243117e-06, "loss": 1.1326, "step": 25100 }, { "epoch": 0.8554261855460131, "grad_norm": 1.0699902772903442, "learning_rate": 7.2456634644760515e-06, "loss": 1.0751, "step": 25200 }, { "epoch": 0.8588207339013544, "grad_norm": 1.3334201574325562, "learning_rate": 7.075936046708986e-06, "loss": 1.083, "step": 25300 }, { "epoch": 0.8622152822566957, "grad_norm": 1.493215799331665, "learning_rate": 6.90620862894192e-06, "loss": 1.152, "step": 25400 }, { "epoch": 0.8656098306120371, "grad_norm": 1.5618408918380737, "learning_rate": 6.736481211174854e-06, "loss": 1.0474, "step": 25500 }, { "epoch": 0.8690043789673784, "grad_norm": 1.4898067712783813, "learning_rate": 6.566753793407787e-06, "loss": 1.1261, "step": 25600 }, { "epoch": 0.8723989273227197, "grad_norm": 1.1436446905136108, "learning_rate": 6.3970263756407215e-06, "loss": 1.0708, "step": 25700 }, { "epoch": 0.875793475678061, "grad_norm": 1.4544737339019775, "learning_rate": 6.227298957873655e-06, "loss": 1.0788, "step": 25800 }, { "epoch": 0.8791880240334023, "grad_norm": 1.0729115009307861, "learning_rate": 6.05757154010659e-06, "loss": 1.0525, "step": 25900 }, { "epoch": 0.8825825723887437, "grad_norm": 1.5516784191131592, "learning_rate": 5.887844122339523e-06, "loss": 1.0874, "step": 26000 }, { "epoch": 0.885977120744085, "grad_norm": 1.4837692975997925, "learning_rate": 5.718116704572457e-06, "loss": 1.1142, "step": 26100 }, { "epoch": 0.8893716690994263, "grad_norm": 1.1491631269454956, "learning_rate": 5.548389286805391e-06, "loss": 1.0618, "step": 26200 }, { "epoch": 0.8927662174547677, "grad_norm": 1.5417340993881226, "learning_rate": 5.378661869038324e-06, "loss": 0.993, "step": 26300 }, { "epoch": 0.896160765810109, "grad_norm": 0.9728216528892517, "learning_rate": 5.208934451271259e-06, "loss": 1.1583, "step": 26400 }, { "epoch": 0.8995553141654503, "grad_norm": 1.0447022914886475, "learning_rate": 5.0392070335041925e-06, "loss": 1.0472, "step": 26500 }, { "epoch": 0.9029498625207916, "grad_norm": 1.2869070768356323, "learning_rate": 4.869479615737126e-06, "loss": 1.0823, "step": 26600 }, { "epoch": 0.9063444108761329, "grad_norm": 1.6531902551651, "learning_rate": 4.69975219797006e-06, "loss": 1.0836, "step": 26700 }, { "epoch": 0.9097389592314743, "grad_norm": 1.559571385383606, "learning_rate": 4.530024780202994e-06, "loss": 1.0009, "step": 26800 }, { "epoch": 0.9131335075868156, "grad_norm": 1.3163347244262695, "learning_rate": 4.360297362435928e-06, "loss": 1.1214, "step": 26900 }, { "epoch": 0.9165280559421569, "grad_norm": 1.1032936573028564, "learning_rate": 4.1905699446688625e-06, "loss": 1.113, "step": 27000 }, { "epoch": 0.9199226042974982, "grad_norm": 1.4257267713546753, "learning_rate": 4.020842526901796e-06, "loss": 1.0477, "step": 27100 }, { "epoch": 0.9233171526528395, "grad_norm": 2.0018675327301025, "learning_rate": 3.85111510913473e-06, "loss": 1.1487, "step": 27200 }, { "epoch": 0.9267117010081809, "grad_norm": 1.38235342502594, "learning_rate": 3.681387691367664e-06, "loss": 1.0816, "step": 27300 }, { "epoch": 0.9301062493635222, "grad_norm": 1.4731274843215942, "learning_rate": 3.5116602736005976e-06, "loss": 1.0882, "step": 27400 }, { "epoch": 0.9335007977188635, "grad_norm": 1.225797414779663, "learning_rate": 3.3419328558335317e-06, "loss": 1.0551, "step": 27500 }, { "epoch": 0.9368953460742049, "grad_norm": 1.597345232963562, "learning_rate": 3.1722054380664653e-06, "loss": 1.0565, "step": 27600 }, { "epoch": 0.9402898944295461, "grad_norm": 1.092685341835022, "learning_rate": 3.0024780202993994e-06, "loss": 1.0821, "step": 27700 }, { "epoch": 0.9436844427848875, "grad_norm": 1.3143861293792725, "learning_rate": 2.832750602532333e-06, "loss": 1.0049, "step": 27800 }, { "epoch": 0.9470789911402288, "grad_norm": 1.1611847877502441, "learning_rate": 2.663023184765267e-06, "loss": 1.031, "step": 27900 }, { "epoch": 0.9504735394955701, "grad_norm": 1.3087385892868042, "learning_rate": 2.4932957669982012e-06, "loss": 0.9714, "step": 28000 }, { "epoch": 0.9538680878509115, "grad_norm": 1.118117094039917, "learning_rate": 2.323568349231135e-06, "loss": 1.0057, "step": 28100 }, { "epoch": 0.9572626362062527, "grad_norm": 1.5035566091537476, "learning_rate": 2.1538409314640686e-06, "loss": 1.1181, "step": 28200 }, { "epoch": 0.9606571845615941, "grad_norm": 1.372116208076477, "learning_rate": 1.9841135136970026e-06, "loss": 1.042, "step": 28300 }, { "epoch": 0.9640517329169354, "grad_norm": 1.189626693725586, "learning_rate": 1.8160833701076074e-06, "loss": 1.0346, "step": 28400 }, { "epoch": 0.9674462812722767, "grad_norm": 1.6013319492340088, "learning_rate": 1.6463559523405412e-06, "loss": 1.06, "step": 28500 }, { "epoch": 0.9708408296276181, "grad_norm": 1.7186366319656372, "learning_rate": 1.4766285345734751e-06, "loss": 1.1076, "step": 28600 }, { "epoch": 0.9742353779829593, "grad_norm": 1.3533858060836792, "learning_rate": 1.306901116806409e-06, "loss": 1.1391, "step": 28700 }, { "epoch": 0.9776299263383007, "grad_norm": 1.34947669506073, "learning_rate": 1.1371736990393429e-06, "loss": 1.0501, "step": 28800 }, { "epoch": 0.981024474693642, "grad_norm": 1.5835421085357666, "learning_rate": 9.674462812722767e-07, "loss": 1.1168, "step": 28900 }, { "epoch": 0.9844190230489833, "grad_norm": 1.2659107446670532, "learning_rate": 7.977188635052106e-07, "loss": 1.0592, "step": 29000 }, { "epoch": 0.9878135714043247, "grad_norm": 1.212120532989502, "learning_rate": 6.279914457381446e-07, "loss": 1.0867, "step": 29100 }, { "epoch": 0.9912081197596659, "grad_norm": 1.5085951089859009, "learning_rate": 4.582640279710785e-07, "loss": 1.0909, "step": 29200 }, { "epoch": 0.9946026681150073, "grad_norm": 1.6492177248001099, "learning_rate": 2.885366102040124e-07, "loss": 1.0747, "step": 29300 }, { "epoch": 0.9979972164703487, "grad_norm": 1.368004322052002, "learning_rate": 1.1880919243694626e-07, "loss": 0.9943, "step": 29400 } ], "logging_steps": 100, "max_steps": 29459, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 29459, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.93185924572119e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }