{ "best_metric": 2.170783519744873, "best_model_checkpoint": "./output/checkpoint-4800", "epoch": 0.5597014925373134, "eval_steps": 150, "global_step": 4950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011307100859339666, "grad_norm": 13.91769790649414, "learning_rate": 1.25e-06, "loss": 2.3085, "step": 10 }, { "epoch": 0.002261420171867933, "grad_norm": 10.41028881072998, "learning_rate": 2.5e-06, "loss": 2.2577, "step": 20 }, { "epoch": 0.0033921302578018998, "grad_norm": 11.071020126342773, "learning_rate": 3.75e-06, "loss": 2.3389, "step": 30 }, { "epoch": 0.004522840343735866, "grad_norm": 10.67103385925293, "learning_rate": 5e-06, "loss": 2.1177, "step": 40 }, { "epoch": 0.005653550429669833, "grad_norm": 8.47650146484375, "learning_rate": 6.25e-06, "loss": 2.2182, "step": 50 }, { "epoch": 0.0067842605156037995, "grad_norm": 9.967148780822754, "learning_rate": 7.5e-06, "loss": 2.3602, "step": 60 }, { "epoch": 0.007914970601537766, "grad_norm": 8.98648738861084, "learning_rate": 8.75e-06, "loss": 2.2154, "step": 70 }, { "epoch": 0.009045680687471733, "grad_norm": 9.23805046081543, "learning_rate": 1e-05, "loss": 2.2039, "step": 80 }, { "epoch": 0.0101763907734057, "grad_norm": 8.63106632232666, "learning_rate": 1.125e-05, "loss": 2.1994, "step": 90 }, { "epoch": 0.011307100859339666, "grad_norm": 6.974855899810791, "learning_rate": 1.25e-05, "loss": 2.3206, "step": 100 }, { "epoch": 0.012437810945273632, "grad_norm": 7.992189884185791, "learning_rate": 1.2499871543489788e-05, "loss": 2.1591, "step": 110 }, { "epoch": 0.013568521031207599, "grad_norm": 9.83857250213623, "learning_rate": 1.2499486179239496e-05, "loss": 2.2987, "step": 120 }, { "epoch": 0.014699231117141566, "grad_norm": 13.996771812438965, "learning_rate": 1.2498843923089939e-05, "loss": 2.1778, "step": 130 }, { "epoch": 0.015829941203075532, "grad_norm": 7.180433750152588, "learning_rate": 1.249794480144175e-05, "loss": 2.1382, "step": 140 }, { "epoch": 0.016960651289009497, "grad_norm": 7.929275035858154, "learning_rate": 1.24967888512543e-05, "loss": 2.2687, "step": 150 }, { "epoch": 0.016960651289009497, "eval_loss": 2.267575263977051, "eval_runtime": 48.3219, "eval_samples_per_second": 10.347, "eval_steps_per_second": 10.347, "step": 150 }, { "epoch": 0.018091361374943465, "grad_norm": 9.578577995300293, "learning_rate": 1.2495376120044174e-05, "loss": 2.1895, "step": 160 }, { "epoch": 0.01922207146087743, "grad_norm": 7.1476945877075195, "learning_rate": 1.2493706665883217e-05, "loss": 2.1138, "step": 170 }, { "epoch": 0.0203527815468114, "grad_norm": 6.884596347808838, "learning_rate": 1.2491780557396153e-05, "loss": 2.1998, "step": 180 }, { "epoch": 0.021483491632745363, "grad_norm": 8.039597511291504, "learning_rate": 1.2489597873757757e-05, "loss": 2.1461, "step": 190 }, { "epoch": 0.022614201718679332, "grad_norm": 7.772514343261719, "learning_rate": 1.2487158704689602e-05, "loss": 2.2605, "step": 200 }, { "epoch": 0.023744911804613297, "grad_norm": 10.420337677001953, "learning_rate": 1.248446315045638e-05, "loss": 2.1968, "step": 210 }, { "epoch": 0.024875621890547265, "grad_norm": 8.081610679626465, "learning_rate": 1.2481511321861762e-05, "loss": 2.1992, "step": 220 }, { "epoch": 0.02600633197648123, "grad_norm": 8.256917953491211, "learning_rate": 1.2478303340243865e-05, "loss": 2.0914, "step": 230 }, { "epoch": 0.027137042062415198, "grad_norm": 7.677591800689697, "learning_rate": 1.2474839337470245e-05, "loss": 2.2734, "step": 240 }, { "epoch": 0.028267752148349163, "grad_norm": 7.500687122344971, "learning_rate": 1.2471119455932489e-05, "loss": 2.2588, "step": 250 }, { "epoch": 0.02939846223428313, "grad_norm": 6.991959571838379, "learning_rate": 1.246714384854036e-05, "loss": 2.1835, "step": 260 }, { "epoch": 0.030529172320217096, "grad_norm": 7.836300849914551, "learning_rate": 1.2462912678715502e-05, "loss": 2.1867, "step": 270 }, { "epoch": 0.031659882406151064, "grad_norm": 8.68214225769043, "learning_rate": 1.245842612038474e-05, "loss": 2.195, "step": 280 }, { "epoch": 0.03279059249208503, "grad_norm": 6.714479923248291, "learning_rate": 1.2453684357972907e-05, "loss": 2.172, "step": 290 }, { "epoch": 0.033921302578018994, "grad_norm": 8.047091484069824, "learning_rate": 1.2448687586395288e-05, "loss": 2.2706, "step": 300 }, { "epoch": 0.033921302578018994, "eval_loss": 2.2518246173858643, "eval_runtime": 48.5765, "eval_samples_per_second": 10.293, "eval_steps_per_second": 10.293, "step": 300 }, { "epoch": 0.03505201266395296, "grad_norm": 7.334674835205078, "learning_rate": 1.2443436011049593e-05, "loss": 2.1871, "step": 310 }, { "epoch": 0.03618272274988693, "grad_norm": 6.183835983276367, "learning_rate": 1.2437929847807512e-05, "loss": 2.1003, "step": 320 }, { "epoch": 0.03731343283582089, "grad_norm": 7.111408710479736, "learning_rate": 1.2432169323005851e-05, "loss": 2.205, "step": 330 }, { "epoch": 0.03844414292175486, "grad_norm": 8.536894798278809, "learning_rate": 1.2426154673437223e-05, "loss": 2.156, "step": 340 }, { "epoch": 0.03957485300768883, "grad_norm": 11.27071475982666, "learning_rate": 1.2419886146340315e-05, "loss": 2.2592, "step": 350 }, { "epoch": 0.0407055630936228, "grad_norm": 10.803840637207031, "learning_rate": 1.2413363999389718e-05, "loss": 2.2399, "step": 360 }, { "epoch": 0.04183627317955676, "grad_norm": 7.575082778930664, "learning_rate": 1.2406588500685356e-05, "loss": 2.1087, "step": 370 }, { "epoch": 0.04296698326549073, "grad_norm": 9.010503768920898, "learning_rate": 1.2399559928741435e-05, "loss": 2.2231, "step": 380 }, { "epoch": 0.044097693351424695, "grad_norm": 7.222311496734619, "learning_rate": 1.2392278572475025e-05, "loss": 2.1999, "step": 390 }, { "epoch": 0.045228403437358664, "grad_norm": 7.897728443145752, "learning_rate": 1.2384744731194159e-05, "loss": 2.1417, "step": 400 }, { "epoch": 0.046359113523292625, "grad_norm": 8.531673431396484, "learning_rate": 1.2376958714585546e-05, "loss": 2.1444, "step": 410 }, { "epoch": 0.04748982360922659, "grad_norm": 6.729523658752441, "learning_rate": 1.2368920842701831e-05, "loss": 2.1738, "step": 420 }, { "epoch": 0.04862053369516056, "grad_norm": 7.764725685119629, "learning_rate": 1.2360631445948449e-05, "loss": 2.2803, "step": 430 }, { "epoch": 0.04975124378109453, "grad_norm": 7.386552333831787, "learning_rate": 1.2352090865070027e-05, "loss": 2.1988, "step": 440 }, { "epoch": 0.05088195386702849, "grad_norm": 8.109585762023926, "learning_rate": 1.2343299451136397e-05, "loss": 2.1637, "step": 450 }, { "epoch": 0.05088195386702849, "eval_loss": 2.2381701469421387, "eval_runtime": 48.4186, "eval_samples_per_second": 10.327, "eval_steps_per_second": 10.327, "step": 450 }, { "epoch": 0.05201266395296246, "grad_norm": 8.307541847229004, "learning_rate": 1.2334257565528155e-05, "loss": 2.1439, "step": 460 }, { "epoch": 0.05314337403889643, "grad_norm": 8.05910873413086, "learning_rate": 1.2324965579921801e-05, "loss": 2.1699, "step": 470 }, { "epoch": 0.054274084124830396, "grad_norm": 8.29764461517334, "learning_rate": 1.2315423876274468e-05, "loss": 2.0972, "step": 480 }, { "epoch": 0.05540479421076436, "grad_norm": 6.467252731323242, "learning_rate": 1.2305632846808221e-05, "loss": 2.2061, "step": 490 }, { "epoch": 0.056535504296698326, "grad_norm": 9.56059455871582, "learning_rate": 1.2295592893993934e-05, "loss": 2.274, "step": 500 }, { "epoch": 0.057666214382632294, "grad_norm": 7.927321434020996, "learning_rate": 1.2285304430534745e-05, "loss": 2.2253, "step": 510 }, { "epoch": 0.05879692446856626, "grad_norm": 7.925745010375977, "learning_rate": 1.2274767879349083e-05, "loss": 2.3301, "step": 520 }, { "epoch": 0.059927634554500224, "grad_norm": 7.2481303215026855, "learning_rate": 1.2263983673553307e-05, "loss": 2.2219, "step": 530 }, { "epoch": 0.06105834464043419, "grad_norm": 6.850359916687012, "learning_rate": 1.2252952256443871e-05, "loss": 2.1766, "step": 540 }, { "epoch": 0.06218905472636816, "grad_norm": 6.884121894836426, "learning_rate": 1.2241674081479129e-05, "loss": 2.1775, "step": 550 }, { "epoch": 0.06331976481230213, "grad_norm": 7.791696071624756, "learning_rate": 1.223014961226068e-05, "loss": 2.0806, "step": 560 }, { "epoch": 0.06445047489823609, "grad_norm": 7.071966648101807, "learning_rate": 1.2218379322514316e-05, "loss": 2.2275, "step": 570 }, { "epoch": 0.06558118498417007, "grad_norm": 7.758040428161621, "learning_rate": 1.2206363696070545e-05, "loss": 2.2537, "step": 580 }, { "epoch": 0.06671189507010403, "grad_norm": 9.378442764282227, "learning_rate": 1.219410322684471e-05, "loss": 2.1729, "step": 590 }, { "epoch": 0.06784260515603799, "grad_norm": 8.120989799499512, "learning_rate": 1.2181598418816679e-05, "loss": 2.1472, "step": 600 }, { "epoch": 0.06784260515603799, "eval_loss": 2.226912498474121, "eval_runtime": 48.3736, "eval_samples_per_second": 10.336, "eval_steps_per_second": 10.336, "step": 600 }, { "epoch": 0.06897331524197196, "grad_norm": 7.539031028747559, "learning_rate": 1.2168849786010134e-05, "loss": 2.2155, "step": 610 }, { "epoch": 0.07010402532790593, "grad_norm": 7.440618991851807, "learning_rate": 1.2155857852471433e-05, "loss": 2.0795, "step": 620 }, { "epoch": 0.07123473541383989, "grad_norm": 7.3796491622924805, "learning_rate": 1.2142623152248081e-05, "loss": 2.2486, "step": 630 }, { "epoch": 0.07236544549977386, "grad_norm": 7.051665306091309, "learning_rate": 1.2129146229366767e-05, "loss": 2.2253, "step": 640 }, { "epoch": 0.07349615558570782, "grad_norm": 7.002635955810547, "learning_rate": 1.2115427637811003e-05, "loss": 2.2325, "step": 650 }, { "epoch": 0.07462686567164178, "grad_norm": 6.768515586853027, "learning_rate": 1.2101467941498358e-05, "loss": 2.1671, "step": 660 }, { "epoch": 0.07575757575757576, "grad_norm": 8.600127220153809, "learning_rate": 1.208726771425727e-05, "loss": 2.1776, "step": 670 }, { "epoch": 0.07688828584350972, "grad_norm": 7.239190101623535, "learning_rate": 1.2072827539803463e-05, "loss": 2.1988, "step": 680 }, { "epoch": 0.0780189959294437, "grad_norm": 8.177273750305176, "learning_rate": 1.205814801171595e-05, "loss": 2.1705, "step": 690 }, { "epoch": 0.07914970601537766, "grad_norm": 7.265347480773926, "learning_rate": 1.2043229733412637e-05, "loss": 2.1495, "step": 700 }, { "epoch": 0.08028041610131162, "grad_norm": 7.961764335632324, "learning_rate": 1.2028073318125511e-05, "loss": 2.1852, "step": 710 }, { "epoch": 0.0814111261872456, "grad_norm": 8.351218223571777, "learning_rate": 1.2012679388875442e-05, "loss": 2.2082, "step": 720 }, { "epoch": 0.08254183627317956, "grad_norm": 7.516690731048584, "learning_rate": 1.1997048578446569e-05, "loss": 2.2127, "step": 730 }, { "epoch": 0.08367254635911352, "grad_norm": 7.568490028381348, "learning_rate": 1.1981181529360284e-05, "loss": 2.2263, "step": 740 }, { "epoch": 0.08480325644504749, "grad_norm": 8.16396427154541, "learning_rate": 1.1965078893848829e-05, "loss": 2.1926, "step": 750 }, { "epoch": 0.08480325644504749, "eval_loss": 2.2287020683288574, "eval_runtime": 48.1835, "eval_samples_per_second": 10.377, "eval_steps_per_second": 10.377, "step": 750 }, { "epoch": 0.08593396653098145, "grad_norm": 7.685788154602051, "learning_rate": 1.1948741333828482e-05, "loss": 2.0831, "step": 760 }, { "epoch": 0.08706467661691543, "grad_norm": 7.220767021179199, "learning_rate": 1.1932169520872344e-05, "loss": 2.2063, "step": 770 }, { "epoch": 0.08819538670284939, "grad_norm": 8.637968063354492, "learning_rate": 1.1915364136182738e-05, "loss": 2.1785, "step": 780 }, { "epoch": 0.08932609678878335, "grad_norm": 8.000389099121094, "learning_rate": 1.189832587056321e-05, "loss": 2.2119, "step": 790 }, { "epoch": 0.09045680687471733, "grad_norm": 6.237274169921875, "learning_rate": 1.188105542439012e-05, "loss": 2.1253, "step": 800 }, { "epoch": 0.09158751696065129, "grad_norm": 7.517107009887695, "learning_rate": 1.186355350758387e-05, "loss": 2.1768, "step": 810 }, { "epoch": 0.09271822704658525, "grad_norm": 7.856780529022217, "learning_rate": 1.1845820839579707e-05, "loss": 2.1553, "step": 820 }, { "epoch": 0.09384893713251923, "grad_norm": 7.150311470031738, "learning_rate": 1.1827858149298162e-05, "loss": 2.1324, "step": 830 }, { "epoch": 0.09497964721845319, "grad_norm": 7.736023902893066, "learning_rate": 1.1809666175115075e-05, "loss": 2.2059, "step": 840 }, { "epoch": 0.09611035730438716, "grad_norm": 6.754352569580078, "learning_rate": 1.1791245664831252e-05, "loss": 2.165, "step": 850 }, { "epoch": 0.09724106739032112, "grad_norm": 7.572906970977783, "learning_rate": 1.177259737564172e-05, "loss": 2.1517, "step": 860 }, { "epoch": 0.09837177747625508, "grad_norm": 8.434165000915527, "learning_rate": 1.1753722074104613e-05, "loss": 2.1766, "step": 870 }, { "epoch": 0.09950248756218906, "grad_norm": 7.941796779632568, "learning_rate": 1.1734620536109645e-05, "loss": 2.1718, "step": 880 }, { "epoch": 0.10063319764812302, "grad_norm": 6.2347283363342285, "learning_rate": 1.1715293546846223e-05, "loss": 2.0972, "step": 890 }, { "epoch": 0.10176390773405698, "grad_norm": 7.7643256187438965, "learning_rate": 1.1695741900771185e-05, "loss": 2.2102, "step": 900 }, { "epoch": 0.10176390773405698, "eval_loss": 2.2237775325775146, "eval_runtime": 48.2403, "eval_samples_per_second": 10.365, "eval_steps_per_second": 10.365, "step": 900 }, { "epoch": 0.10289461781999096, "grad_norm": 8.228760719299316, "learning_rate": 1.1675966401576116e-05, "loss": 2.1872, "step": 910 }, { "epoch": 0.10402532790592492, "grad_norm": 7.78733491897583, "learning_rate": 1.1655967862154335e-05, "loss": 2.1399, "step": 920 }, { "epoch": 0.10515603799185888, "grad_norm": 7.820289611816406, "learning_rate": 1.1635747104567469e-05, "loss": 2.2286, "step": 930 }, { "epoch": 0.10628674807779286, "grad_norm": 9.01229190826416, "learning_rate": 1.1615304960011663e-05, "loss": 2.2352, "step": 940 }, { "epoch": 0.10741745816372682, "grad_norm": 10.763086318969727, "learning_rate": 1.1594642268783414e-05, "loss": 2.1866, "step": 950 }, { "epoch": 0.10854816824966079, "grad_norm": 8.038209915161133, "learning_rate": 1.1573759880245028e-05, "loss": 2.2101, "step": 960 }, { "epoch": 0.10967887833559475, "grad_norm": 6.662899017333984, "learning_rate": 1.1552658652789704e-05, "loss": 2.2184, "step": 970 }, { "epoch": 0.11080958842152872, "grad_norm": 7.763529300689697, "learning_rate": 1.153133945380626e-05, "loss": 2.1938, "step": 980 }, { "epoch": 0.11194029850746269, "grad_norm": 6.895880699157715, "learning_rate": 1.1509803159643458e-05, "loss": 2.1889, "step": 990 }, { "epoch": 0.11307100859339665, "grad_norm": 8.422462463378906, "learning_rate": 1.1488050655574003e-05, "loss": 2.1557, "step": 1000 }, { "epoch": 0.11420171867933061, "grad_norm": 6.711178302764893, "learning_rate": 1.1466082835758142e-05, "loss": 2.2016, "step": 1010 }, { "epoch": 0.11533242876526459, "grad_norm": 6.60110330581665, "learning_rate": 1.1443900603206901e-05, "loss": 2.246, "step": 1020 }, { "epoch": 0.11646313885119855, "grad_norm": 8.553449630737305, "learning_rate": 1.1421504869744979e-05, "loss": 2.1259, "step": 1030 }, { "epoch": 0.11759384893713253, "grad_norm": 7.317471027374268, "learning_rate": 1.139889655597326e-05, "loss": 2.1144, "step": 1040 }, { "epoch": 0.11872455902306649, "grad_norm": 7.992708206176758, "learning_rate": 1.1376076591230975e-05, "loss": 2.2178, "step": 1050 }, { "epoch": 0.11872455902306649, "eval_loss": 2.216776132583618, "eval_runtime": 48.4782, "eval_samples_per_second": 10.314, "eval_steps_per_second": 10.314, "step": 1050 }, { "epoch": 0.11985526910900045, "grad_norm": 7.371614933013916, "learning_rate": 1.1353045913557491e-05, "loss": 2.0985, "step": 1060 }, { "epoch": 0.12098597919493442, "grad_norm": 6.453526496887207, "learning_rate": 1.1329805469653767e-05, "loss": 2.1691, "step": 1070 }, { "epoch": 0.12211668928086838, "grad_norm": 6.573697566986084, "learning_rate": 1.1306356214843423e-05, "loss": 2.2535, "step": 1080 }, { "epoch": 0.12324739936680235, "grad_norm": 7.33532190322876, "learning_rate": 1.1282699113033476e-05, "loss": 2.1982, "step": 1090 }, { "epoch": 0.12437810945273632, "grad_norm": 7.721541404724121, "learning_rate": 1.125883513667473e-05, "loss": 2.1839, "step": 1100 }, { "epoch": 0.1255088195386703, "grad_norm": 7.723012447357178, "learning_rate": 1.123476526672178e-05, "loss": 2.1206, "step": 1110 }, { "epoch": 0.12663952962460426, "grad_norm": 7.843628406524658, "learning_rate": 1.1210490492592705e-05, "loss": 2.1033, "step": 1120 }, { "epoch": 0.12777023971053822, "grad_norm": 8.290491104125977, "learning_rate": 1.118601181212839e-05, "loss": 2.1742, "step": 1130 }, { "epoch": 0.12890094979647218, "grad_norm": 7.601134300231934, "learning_rate": 1.1161330231551516e-05, "loss": 2.1343, "step": 1140 }, { "epoch": 0.13003165988240614, "grad_norm": 8.088129043579102, "learning_rate": 1.1136446765425187e-05, "loss": 2.1998, "step": 1150 }, { "epoch": 0.13116236996834013, "grad_norm": 6.895781993865967, "learning_rate": 1.1111362436611233e-05, "loss": 2.2096, "step": 1160 }, { "epoch": 0.1322930800542741, "grad_norm": 7.831145286560059, "learning_rate": 1.1086078276228168e-05, "loss": 2.1492, "step": 1170 }, { "epoch": 0.13342379014020805, "grad_norm": 8.054121971130371, "learning_rate": 1.1060595323608789e-05, "loss": 2.1636, "step": 1180 }, { "epoch": 0.13455450022614202, "grad_norm": 9.498968124389648, "learning_rate": 1.1034914626257467e-05, "loss": 2.1666, "step": 1190 }, { "epoch": 0.13568521031207598, "grad_norm": 6.879083633422852, "learning_rate": 1.1009037239807091e-05, "loss": 2.1619, "step": 1200 }, { "epoch": 0.13568521031207598, "eval_loss": 2.214973211288452, "eval_runtime": 48.4103, "eval_samples_per_second": 10.328, "eval_steps_per_second": 10.328, "step": 1200 }, { "epoch": 0.13681592039800994, "grad_norm": 6.76094913482666, "learning_rate": 1.098296422797566e-05, "loss": 2.0658, "step": 1210 }, { "epoch": 0.13794663048394393, "grad_norm": 6.721004486083984, "learning_rate": 1.095669666252257e-05, "loss": 2.1488, "step": 1220 }, { "epoch": 0.1390773405698779, "grad_norm": 9.493650436401367, "learning_rate": 1.0930235623204552e-05, "loss": 2.1674, "step": 1230 }, { "epoch": 0.14020805065581185, "grad_norm": 8.290738105773926, "learning_rate": 1.0903582197731294e-05, "loss": 2.2566, "step": 1240 }, { "epoch": 0.1413387607417458, "grad_norm": 8.632882118225098, "learning_rate": 1.0876737481720722e-05, "loss": 2.0707, "step": 1250 }, { "epoch": 0.14246947082767977, "grad_norm": 6.878713607788086, "learning_rate": 1.0849702578653969e-05, "loss": 2.2293, "step": 1260 }, { "epoch": 0.14360018091361376, "grad_norm": 7.139357566833496, "learning_rate": 1.0822478599830009e-05, "loss": 2.169, "step": 1270 }, { "epoch": 0.14473089099954772, "grad_norm": 7.247822284698486, "learning_rate": 1.0795066664319983e-05, "loss": 2.1815, "step": 1280 }, { "epoch": 0.14586160108548168, "grad_norm": 8.376590728759766, "learning_rate": 1.0767467898921198e-05, "loss": 2.1183, "step": 1290 }, { "epoch": 0.14699231117141565, "grad_norm": 9.783007621765137, "learning_rate": 1.0739683438110799e-05, "loss": 2.1431, "step": 1300 }, { "epoch": 0.1481230212573496, "grad_norm": 7.473887920379639, "learning_rate": 1.0711714423999145e-05, "loss": 2.204, "step": 1310 }, { "epoch": 0.14925373134328357, "grad_norm": 7.32519006729126, "learning_rate": 1.0683562006282862e-05, "loss": 2.2538, "step": 1320 }, { "epoch": 0.15038444142921756, "grad_norm": 7.444912433624268, "learning_rate": 1.0655227342197573e-05, "loss": 2.0875, "step": 1330 }, { "epoch": 0.15151515151515152, "grad_norm": 7.5810465812683105, "learning_rate": 1.0626711596470345e-05, "loss": 2.1612, "step": 1340 }, { "epoch": 0.15264586160108548, "grad_norm": 6.272039890289307, "learning_rate": 1.0598015941271792e-05, "loss": 2.1672, "step": 1350 }, { "epoch": 0.15264586160108548, "eval_loss": 2.2149977684020996, "eval_runtime": 48.562, "eval_samples_per_second": 10.296, "eval_steps_per_second": 10.296, "step": 1350 }, { "epoch": 0.15377657168701944, "grad_norm": 7.797894477844238, "learning_rate": 1.0569141556167905e-05, "loss": 2.2649, "step": 1360 }, { "epoch": 0.1549072817729534, "grad_norm": 8.073437690734863, "learning_rate": 1.0540089628071565e-05, "loss": 2.1772, "step": 1370 }, { "epoch": 0.1560379918588874, "grad_norm": 6.944803714752197, "learning_rate": 1.0510861351193747e-05, "loss": 2.1655, "step": 1380 }, { "epoch": 0.15716870194482135, "grad_norm": 7.841436862945557, "learning_rate": 1.0481457926994435e-05, "loss": 2.1477, "step": 1390 }, { "epoch": 0.15829941203075532, "grad_norm": 7.676872730255127, "learning_rate": 1.045188056413323e-05, "loss": 2.1659, "step": 1400 }, { "epoch": 0.15943012211668928, "grad_norm": 6.854037761688232, "learning_rate": 1.0422130478419676e-05, "loss": 2.1251, "step": 1410 }, { "epoch": 0.16056083220262324, "grad_norm": 6.189146995544434, "learning_rate": 1.0392208892763269e-05, "loss": 2.142, "step": 1420 }, { "epoch": 0.16169154228855723, "grad_norm": 6.691437721252441, "learning_rate": 1.0362117037123204e-05, "loss": 2.1415, "step": 1430 }, { "epoch": 0.1628222523744912, "grad_norm": 7.207416534423828, "learning_rate": 1.0331856148457804e-05, "loss": 2.2079, "step": 1440 }, { "epoch": 0.16395296246042515, "grad_norm": 7.994815826416016, "learning_rate": 1.030142747067368e-05, "loss": 2.1378, "step": 1450 }, { "epoch": 0.1650836725463591, "grad_norm": 9.272295951843262, "learning_rate": 1.027083225457459e-05, "loss": 2.2321, "step": 1460 }, { "epoch": 0.16621438263229307, "grad_norm": 7.973877429962158, "learning_rate": 1.0240071757810035e-05, "loss": 2.2404, "step": 1470 }, { "epoch": 0.16734509271822703, "grad_norm": 6.90666389465332, "learning_rate": 1.0209147244823564e-05, "loss": 2.0819, "step": 1480 }, { "epoch": 0.16847580280416102, "grad_norm": 6.657747268676758, "learning_rate": 1.0178059986800773e-05, "loss": 2.2259, "step": 1490 }, { "epoch": 0.16960651289009498, "grad_norm": 7.255081653594971, "learning_rate": 1.0146811261617086e-05, "loss": 2.0876, "step": 1500 }, { "epoch": 0.16960651289009498, "eval_loss": 2.2110049724578857, "eval_runtime": 48.1852, "eval_samples_per_second": 10.377, "eval_steps_per_second": 10.377, "step": 1500 }, { "epoch": 0.17073722297602895, "grad_norm": 7.345371723175049, "learning_rate": 1.0115402353785198e-05, "loss": 2.1589, "step": 1510 }, { "epoch": 0.1718679330619629, "grad_norm": 7.011831283569336, "learning_rate": 1.0083834554402293e-05, "loss": 2.2614, "step": 1520 }, { "epoch": 0.17299864314789687, "grad_norm": 12.274511337280273, "learning_rate": 1.0052109161096959e-05, "loss": 2.1689, "step": 1530 }, { "epoch": 0.17412935323383086, "grad_norm": 7.7154154777526855, "learning_rate": 1.0020227477975852e-05, "loss": 2.186, "step": 1540 }, { "epoch": 0.17526006331976482, "grad_norm": 7.487543106079102, "learning_rate": 9.988190815570101e-06, "loss": 2.1763, "step": 1550 }, { "epoch": 0.17639077340569878, "grad_norm": 10.611146926879883, "learning_rate": 9.95600049078141e-06, "loss": 2.0802, "step": 1560 }, { "epoch": 0.17752148349163274, "grad_norm": 8.909916877746582, "learning_rate": 9.923657826827957e-06, "loss": 2.1643, "step": 1570 }, { "epoch": 0.1786521935775667, "grad_norm": 7.380209922790527, "learning_rate": 9.891164153189975e-06, "loss": 2.1799, "step": 1580 }, { "epoch": 0.17978290366350066, "grad_norm": 7.811763286590576, "learning_rate": 9.858520805555123e-06, "loss": 2.1943, "step": 1590 }, { "epoch": 0.18091361374943465, "grad_norm": 8.154491424560547, "learning_rate": 9.825729125763562e-06, "loss": 2.179, "step": 1600 }, { "epoch": 0.18204432383536862, "grad_norm": 7.4585347175598145, "learning_rate": 9.792790461752813e-06, "loss": 2.2112, "step": 1610 }, { "epoch": 0.18317503392130258, "grad_norm": 6.881679534912109, "learning_rate": 9.759706167502343e-06, "loss": 2.0959, "step": 1620 }, { "epoch": 0.18430574400723654, "grad_norm": 6.962556838989258, "learning_rate": 9.726477602977906e-06, "loss": 2.2202, "step": 1630 }, { "epoch": 0.1854364540931705, "grad_norm": 7.402323246002197, "learning_rate": 9.693106134075641e-06, "loss": 2.1364, "step": 1640 }, { "epoch": 0.1865671641791045, "grad_norm": 10.563421249389648, "learning_rate": 9.659593132565929e-06, "loss": 2.1031, "step": 1650 }, { "epoch": 0.1865671641791045, "eval_loss": 2.210020065307617, "eval_runtime": 48.3, "eval_samples_per_second": 10.352, "eval_steps_per_second": 10.352, "step": 1650 }, { "epoch": 0.18769787426503845, "grad_norm": 6.798852443695068, "learning_rate": 9.625939976037002e-06, "loss": 2.1364, "step": 1660 }, { "epoch": 0.1888285843509724, "grad_norm": 7.535129070281982, "learning_rate": 9.59214804783831e-06, "loss": 2.1793, "step": 1670 }, { "epoch": 0.18995929443690637, "grad_norm": 6.925334453582764, "learning_rate": 9.558218737023673e-06, "loss": 2.1797, "step": 1680 }, { "epoch": 0.19109000452284033, "grad_norm": 8.61997127532959, "learning_rate": 9.524153438294159e-06, "loss": 2.1506, "step": 1690 }, { "epoch": 0.19222071460877432, "grad_norm": 7.673538684844971, "learning_rate": 9.489953551940784e-06, "loss": 2.1499, "step": 1700 }, { "epoch": 0.19335142469470828, "grad_norm": 6.962465286254883, "learning_rate": 9.455620483786914e-06, "loss": 2.262, "step": 1710 }, { "epoch": 0.19448213478064225, "grad_norm": 6.833783149719238, "learning_rate": 9.421155645130514e-06, "loss": 2.1148, "step": 1720 }, { "epoch": 0.1956128448665762, "grad_norm": 7.818989276885986, "learning_rate": 9.386560452686111e-06, "loss": 2.1957, "step": 1730 }, { "epoch": 0.19674355495251017, "grad_norm": 7.868941307067871, "learning_rate": 9.351836328526564e-06, "loss": 2.1212, "step": 1740 }, { "epoch": 0.19787426503844413, "grad_norm": 8.223631858825684, "learning_rate": 9.316984700024613e-06, "loss": 2.1822, "step": 1750 }, { "epoch": 0.19900497512437812, "grad_norm": 6.389924049377441, "learning_rate": 9.282006999794201e-06, "loss": 2.137, "step": 1760 }, { "epoch": 0.20013568521031208, "grad_norm": 7.871743202209473, "learning_rate": 9.246904665631587e-06, "loss": 2.1799, "step": 1770 }, { "epoch": 0.20126639529624604, "grad_norm": 7.172389984130859, "learning_rate": 9.211679140456241e-06, "loss": 2.1468, "step": 1780 }, { "epoch": 0.20239710538218, "grad_norm": 7.850325107574463, "learning_rate": 9.176331872251538e-06, "loss": 2.1317, "step": 1790 }, { "epoch": 0.20352781546811397, "grad_norm": 6.912145137786865, "learning_rate": 9.140864314005223e-06, "loss": 2.2408, "step": 1800 }, { "epoch": 0.20352781546811397, "eval_loss": 2.202786445617676, "eval_runtime": 48.3245, "eval_samples_per_second": 10.347, "eval_steps_per_second": 10.347, "step": 1800 }, { "epoch": 0.20465852555404795, "grad_norm": 7.549516201019287, "learning_rate": 9.105277923649698e-06, "loss": 2.1358, "step": 1810 }, { "epoch": 0.20578923563998192, "grad_norm": 7.592644214630127, "learning_rate": 9.069574164002092e-06, "loss": 2.1375, "step": 1820 }, { "epoch": 0.20691994572591588, "grad_norm": 7.425882816314697, "learning_rate": 9.033754502704119e-06, "loss": 2.2157, "step": 1830 }, { "epoch": 0.20805065581184984, "grad_norm": 7.47833776473999, "learning_rate": 8.997820412161765e-06, "loss": 2.1905, "step": 1840 }, { "epoch": 0.2091813658977838, "grad_norm": 7.294544696807861, "learning_rate": 8.961773369484739e-06, "loss": 2.1709, "step": 1850 }, { "epoch": 0.21031207598371776, "grad_norm": 7.1882219314575195, "learning_rate": 8.925614856425787e-06, "loss": 2.1735, "step": 1860 }, { "epoch": 0.21144278606965175, "grad_norm": 7.234121322631836, "learning_rate": 8.88934635931975e-06, "loss": 2.159, "step": 1870 }, { "epoch": 0.2125734961555857, "grad_norm": 7.731087684631348, "learning_rate": 8.852969369022494e-06, "loss": 2.1446, "step": 1880 }, { "epoch": 0.21370420624151967, "grad_norm": 6.509254455566406, "learning_rate": 8.816485380849613e-06, "loss": 2.0945, "step": 1890 }, { "epoch": 0.21483491632745363, "grad_norm": 8.207448959350586, "learning_rate": 8.779895894514961e-06, "loss": 2.1328, "step": 1900 }, { "epoch": 0.2159656264133876, "grad_norm": 7.515757083892822, "learning_rate": 8.743202414069012e-06, "loss": 2.1354, "step": 1910 }, { "epoch": 0.21709633649932158, "grad_norm": 7.596570014953613, "learning_rate": 8.706406447837024e-06, "loss": 2.0655, "step": 1920 }, { "epoch": 0.21822704658525555, "grad_norm": 7.666327476501465, "learning_rate": 8.669509508357052e-06, "loss": 2.24, "step": 1930 }, { "epoch": 0.2193577566711895, "grad_norm": 7.626475811004639, "learning_rate": 8.632513112317761e-06, "loss": 2.2008, "step": 1940 }, { "epoch": 0.22048846675712347, "grad_norm": 8.541882514953613, "learning_rate": 8.59541878049609e-06, "loss": 2.1567, "step": 1950 }, { "epoch": 0.22048846675712347, "eval_loss": 2.1970231533050537, "eval_runtime": 48.3007, "eval_samples_per_second": 10.352, "eval_steps_per_second": 10.352, "step": 1950 }, { "epoch": 0.22161917684305743, "grad_norm": 7.686795711517334, "learning_rate": 8.558228037694728e-06, "loss": 2.1799, "step": 1960 }, { "epoch": 0.22274988692899142, "grad_norm": 6.517086982727051, "learning_rate": 8.520942412679448e-06, "loss": 2.1025, "step": 1970 }, { "epoch": 0.22388059701492538, "grad_norm": 6.930755138397217, "learning_rate": 8.483563438116257e-06, "loss": 2.2206, "step": 1980 }, { "epoch": 0.22501130710085934, "grad_norm": 7.571625709533691, "learning_rate": 8.446092650508393e-06, "loss": 2.2103, "step": 1990 }, { "epoch": 0.2261420171867933, "grad_norm": 6.801918983459473, "learning_rate": 8.408531590133173e-06, "loss": 2.1339, "step": 2000 }, { "epoch": 0.22727272727272727, "grad_norm": 7.028923511505127, "learning_rate": 8.370881800978673e-06, "loss": 2.102, "step": 2010 }, { "epoch": 0.22840343735866123, "grad_norm": 7.287121295928955, "learning_rate": 8.333144830680262e-06, "loss": 2.0537, "step": 2020 }, { "epoch": 0.22953414744459522, "grad_norm": 7.513479232788086, "learning_rate": 8.29532223045698e-06, "loss": 2.1064, "step": 2030 }, { "epoch": 0.23066485753052918, "grad_norm": 6.845513820648193, "learning_rate": 8.257415555047786e-06, "loss": 2.1741, "step": 2040 }, { "epoch": 0.23179556761646314, "grad_norm": 6.679000377655029, "learning_rate": 8.219426362647631e-06, "loss": 2.093, "step": 2050 }, { "epoch": 0.2329262777023971, "grad_norm": 7.333429336547852, "learning_rate": 8.181356214843423e-06, "loss": 2.1542, "step": 2060 }, { "epoch": 0.23405698778833106, "grad_norm": 5.461650371551514, "learning_rate": 8.143206676549826e-06, "loss": 2.0268, "step": 2070 }, { "epoch": 0.23518769787426505, "grad_norm": 8.255279541015625, "learning_rate": 8.104979315944941e-06, "loss": 2.1679, "step": 2080 }, { "epoch": 0.236318407960199, "grad_norm": 6.765146732330322, "learning_rate": 8.066675704405837e-06, "loss": 2.1293, "step": 2090 }, { "epoch": 0.23744911804613297, "grad_norm": 7.783084392547607, "learning_rate": 8.028297416443953e-06, "loss": 2.2362, "step": 2100 }, { "epoch": 0.23744911804613297, "eval_loss": 2.1942195892333984, "eval_runtime": 48.2758, "eval_samples_per_second": 10.357, "eval_steps_per_second": 10.357, "step": 2100 }, { "epoch": 0.23857982813206693, "grad_norm": 6.722920894622803, "learning_rate": 7.989846029640397e-06, "loss": 2.2344, "step": 2110 }, { "epoch": 0.2397105382180009, "grad_norm": 10.337465286254883, "learning_rate": 7.95132312458107e-06, "loss": 2.0739, "step": 2120 }, { "epoch": 0.24084124830393486, "grad_norm": 7.2912068367004395, "learning_rate": 7.91273028479172e-06, "loss": 2.1856, "step": 2130 }, { "epoch": 0.24197195838986885, "grad_norm": 7.101160526275635, "learning_rate": 7.87406909667283e-06, "loss": 2.1609, "step": 2140 }, { "epoch": 0.2431026684758028, "grad_norm": 9.643807411193848, "learning_rate": 7.835341149434421e-06, "loss": 2.1175, "step": 2150 }, { "epoch": 0.24423337856173677, "grad_norm": 7.488255500793457, "learning_rate": 7.796548035030715e-06, "loss": 2.2064, "step": 2160 }, { "epoch": 0.24536408864767073, "grad_norm": 7.858828544616699, "learning_rate": 7.757691348094704e-06, "loss": 2.1972, "step": 2170 }, { "epoch": 0.2464947987336047, "grad_norm": 8.380545616149902, "learning_rate": 7.718772685872596e-06, "loss": 2.169, "step": 2180 }, { "epoch": 0.24762550881953868, "grad_norm": 8.051899909973145, "learning_rate": 7.67979364815816e-06, "loss": 2.1233, "step": 2190 }, { "epoch": 0.24875621890547264, "grad_norm": 7.579977035522461, "learning_rate": 7.640755837226965e-06, "loss": 2.205, "step": 2200 }, { "epoch": 0.2498869289914066, "grad_norm": 6.5006513595581055, "learning_rate": 7.601660857770522e-06, "loss": 2.1453, "step": 2210 }, { "epoch": 0.2510176390773406, "grad_norm": 7.894851207733154, "learning_rate": 7.562510316830308e-06, "loss": 2.1332, "step": 2220 }, { "epoch": 0.25214834916327455, "grad_norm": 6.082065582275391, "learning_rate": 7.523305823731723e-06, "loss": 2.1449, "step": 2230 }, { "epoch": 0.2532790592492085, "grad_norm": 6.983736038208008, "learning_rate": 7.48404899001792e-06, "loss": 2.1606, "step": 2240 }, { "epoch": 0.2544097693351425, "grad_norm": 7.9267897605896, "learning_rate": 7.444741429383578e-06, "loss": 2.2194, "step": 2250 }, { "epoch": 0.2544097693351425, "eval_loss": 2.194028854370117, "eval_runtime": 48.2634, "eval_samples_per_second": 10.36, "eval_steps_per_second": 10.36, "step": 2250 }, { "epoch": 0.25554047942107644, "grad_norm": 7.207333087921143, "learning_rate": 7.405384757608555e-06, "loss": 2.1735, "step": 2260 }, { "epoch": 0.2566711895070104, "grad_norm": 6.758321285247803, "learning_rate": 7.365980592491479e-06, "loss": 1.9842, "step": 2270 }, { "epoch": 0.25780189959294436, "grad_norm": 8.437943458557129, "learning_rate": 7.326530553783244e-06, "loss": 2.1625, "step": 2280 }, { "epoch": 0.2589326096788783, "grad_norm": 6.544831275939941, "learning_rate": 7.287036263120425e-06, "loss": 2.104, "step": 2290 }, { "epoch": 0.2600633197648123, "grad_norm": 7.0727925300598145, "learning_rate": 7.2474993439586206e-06, "loss": 2.1288, "step": 2300 }, { "epoch": 0.26119402985074625, "grad_norm": 7.700973987579346, "learning_rate": 7.207921421505724e-06, "loss": 2.1661, "step": 2310 }, { "epoch": 0.26232473993668026, "grad_norm": 9.491445541381836, "learning_rate": 7.168304122655113e-06, "loss": 2.1306, "step": 2320 }, { "epoch": 0.2634554500226142, "grad_norm": 6.5398359298706055, "learning_rate": 7.128649075918768e-06, "loss": 2.1642, "step": 2330 }, { "epoch": 0.2645861601085482, "grad_norm": 7.057576656341553, "learning_rate": 7.088957911360347e-06, "loss": 2.1539, "step": 2340 }, { "epoch": 0.26571687019448215, "grad_norm": 7.789804935455322, "learning_rate": 7.049232260528163e-06, "loss": 2.1235, "step": 2350 }, { "epoch": 0.2668475802804161, "grad_norm": 7.7269392013549805, "learning_rate": 7.009473756388128e-06, "loss": 2.1695, "step": 2360 }, { "epoch": 0.26797829036635007, "grad_norm": 7.795671463012695, "learning_rate": 6.9696840332566226e-06, "loss": 2.1804, "step": 2370 }, { "epoch": 0.26910900045228403, "grad_norm": 8.445046424865723, "learning_rate": 6.929864726733319e-06, "loss": 2.1765, "step": 2380 }, { "epoch": 0.270239710538218, "grad_norm": 6.396809101104736, "learning_rate": 6.890017473633946e-06, "loss": 2.1375, "step": 2390 }, { "epoch": 0.27137042062415195, "grad_norm": 8.353865623474121, "learning_rate": 6.850143911923011e-06, "loss": 2.1633, "step": 2400 }, { "epoch": 0.27137042062415195, "eval_loss": 2.1882543563842773, "eval_runtime": 48.3538, "eval_samples_per_second": 10.34, "eval_steps_per_second": 10.34, "step": 2400 }, { "epoch": 0.2725011307100859, "grad_norm": 6.948223114013672, "learning_rate": 6.81024568064646e-06, "loss": 2.1329, "step": 2410 }, { "epoch": 0.2736318407960199, "grad_norm": 8.08130931854248, "learning_rate": 6.770324419864309e-06, "loss": 2.1599, "step": 2420 }, { "epoch": 0.2747625508819539, "grad_norm": 8.00659465789795, "learning_rate": 6.73038177058323e-06, "loss": 2.21, "step": 2430 }, { "epoch": 0.27589326096788785, "grad_norm": 7.727768898010254, "learning_rate": 6.690419374689087e-06, "loss": 2.1515, "step": 2440 }, { "epoch": 0.2770239710538218, "grad_norm": 7.162560939788818, "learning_rate": 6.650438874879457e-06, "loss": 2.1828, "step": 2450 }, { "epoch": 0.2781546811397558, "grad_norm": 11.718711853027344, "learning_rate": 6.6104419145960914e-06, "loss": 2.1361, "step": 2460 }, { "epoch": 0.27928539122568974, "grad_norm": 6.664314270019531, "learning_rate": 6.57043013795737e-06, "loss": 2.1065, "step": 2470 }, { "epoch": 0.2804161013116237, "grad_norm": 8.2750883102417, "learning_rate": 6.530405189690719e-06, "loss": 2.1305, "step": 2480 }, { "epoch": 0.28154681139755766, "grad_norm": 8.281411170959473, "learning_rate": 6.49036871506499e-06, "loss": 2.2141, "step": 2490 }, { "epoch": 0.2826775214834916, "grad_norm": 6.95843505859375, "learning_rate": 6.450322359822847e-06, "loss": 2.1649, "step": 2500 }, { "epoch": 0.2838082315694256, "grad_norm": 7.3537983894348145, "learning_rate": 6.4102677701130976e-06, "loss": 2.1642, "step": 2510 }, { "epoch": 0.28493894165535955, "grad_norm": 5.99790620803833, "learning_rate": 6.3702065924230445e-06, "loss": 2.0721, "step": 2520 }, { "epoch": 0.2860696517412935, "grad_norm": 11.255165100097656, "learning_rate": 6.330140473510796e-06, "loss": 2.1195, "step": 2530 }, { "epoch": 0.2872003618272275, "grad_norm": 7.260537147521973, "learning_rate": 6.2900710603375695e-06, "loss": 2.1233, "step": 2540 }, { "epoch": 0.2883310719131615, "grad_norm": 7.565793037414551, "learning_rate": 6.25e-06, "loss": 2.1365, "step": 2550 }, { "epoch": 0.2883310719131615, "eval_loss": 2.1869258880615234, "eval_runtime": 48.3358, "eval_samples_per_second": 10.344, "eval_steps_per_second": 10.344, "step": 2550 }, { "epoch": 0.28946178199909545, "grad_norm": 7.068634033203125, "learning_rate": 6.209928939662431e-06, "loss": 2.0204, "step": 2560 }, { "epoch": 0.2905924920850294, "grad_norm": 7.100194931030273, "learning_rate": 6.169859526489205e-06, "loss": 2.1537, "step": 2570 }, { "epoch": 0.29172320217096337, "grad_norm": 7.691938877105713, "learning_rate": 6.129793407576956e-06, "loss": 2.1196, "step": 2580 }, { "epoch": 0.29285391225689733, "grad_norm": 9.350695610046387, "learning_rate": 6.089732229886904e-06, "loss": 2.1453, "step": 2590 }, { "epoch": 0.2939846223428313, "grad_norm": 6.743042469024658, "learning_rate": 6.049677640177155e-06, "loss": 2.1247, "step": 2600 }, { "epoch": 0.29511533242876525, "grad_norm": 7.808825969696045, "learning_rate": 6.00963128493501e-06, "loss": 2.1341, "step": 2610 }, { "epoch": 0.2962460425146992, "grad_norm": 7.13515567779541, "learning_rate": 5.969594810309284e-06, "loss": 2.1384, "step": 2620 }, { "epoch": 0.2973767526006332, "grad_norm": 8.79920768737793, "learning_rate": 5.929569862042631e-06, "loss": 2.1525, "step": 2630 }, { "epoch": 0.29850746268656714, "grad_norm": 7.0202131271362305, "learning_rate": 5.889558085403911e-06, "loss": 2.1237, "step": 2640 }, { "epoch": 0.29963817277250115, "grad_norm": 9.797450065612793, "learning_rate": 5.849561125120545e-06, "loss": 2.1394, "step": 2650 }, { "epoch": 0.3007688828584351, "grad_norm": 6.459300994873047, "learning_rate": 5.8095806253109125e-06, "loss": 2.1357, "step": 2660 }, { "epoch": 0.3018995929443691, "grad_norm": 7.996516704559326, "learning_rate": 5.769618229416773e-06, "loss": 2.0421, "step": 2670 }, { "epoch": 0.30303030303030304, "grad_norm": 6.942202091217041, "learning_rate": 5.7296755801356925e-06, "loss": 2.1944, "step": 2680 }, { "epoch": 0.304161013116237, "grad_norm": 7.5104241371154785, "learning_rate": 5.6897543193535415e-06, "loss": 2.1261, "step": 2690 }, { "epoch": 0.30529172320217096, "grad_norm": 6.291525363922119, "learning_rate": 5.649856088076989e-06, "loss": 2.1016, "step": 2700 }, { "epoch": 0.30529172320217096, "eval_loss": 2.1850855350494385, "eval_runtime": 48.3001, "eval_samples_per_second": 10.352, "eval_steps_per_second": 10.352, "step": 2700 }, { "epoch": 0.3064224332881049, "grad_norm": 7.2428364753723145, "learning_rate": 5.609982526366055e-06, "loss": 2.1139, "step": 2710 }, { "epoch": 0.3075531433740389, "grad_norm": 6.753382205963135, "learning_rate": 5.570135273266683e-06, "loss": 2.0327, "step": 2720 }, { "epoch": 0.30868385345997285, "grad_norm": 6.91438102722168, "learning_rate": 5.53031596674338e-06, "loss": 2.1193, "step": 2730 }, { "epoch": 0.3098145635459068, "grad_norm": 9.19656753540039, "learning_rate": 5.490526243611873e-06, "loss": 2.185, "step": 2740 }, { "epoch": 0.31094527363184077, "grad_norm": 7.982601165771484, "learning_rate": 5.450767739471837e-06, "loss": 2.1547, "step": 2750 }, { "epoch": 0.3120759837177748, "grad_norm": 7.3903489112854, "learning_rate": 5.411042088639655e-06, "loss": 2.1588, "step": 2760 }, { "epoch": 0.31320669380370875, "grad_norm": 7.590287208557129, "learning_rate": 5.371350924081234e-06, "loss": 2.0884, "step": 2770 }, { "epoch": 0.3143374038896427, "grad_norm": 7.515122890472412, "learning_rate": 5.331695877344888e-06, "loss": 2.2465, "step": 2780 }, { "epoch": 0.31546811397557667, "grad_norm": 8.218059539794922, "learning_rate": 5.292078578494275e-06, "loss": 2.1522, "step": 2790 }, { "epoch": 0.31659882406151063, "grad_norm": 7.273688316345215, "learning_rate": 5.252500656041382e-06, "loss": 2.1133, "step": 2800 }, { "epoch": 0.3177295341474446, "grad_norm": 7.153193950653076, "learning_rate": 5.212963736879578e-06, "loss": 2.0886, "step": 2810 }, { "epoch": 0.31886024423337855, "grad_norm": 7.041120529174805, "learning_rate": 5.1734694462167574e-06, "loss": 2.1375, "step": 2820 }, { "epoch": 0.3199909543193125, "grad_norm": 7.7936906814575195, "learning_rate": 5.134019407508521e-06, "loss": 2.1001, "step": 2830 }, { "epoch": 0.3211216644052465, "grad_norm": 7.668168067932129, "learning_rate": 5.094615242391446e-06, "loss": 2.1571, "step": 2840 }, { "epoch": 0.32225237449118044, "grad_norm": 7.068964958190918, "learning_rate": 5.055258570616425e-06, "loss": 2.1478, "step": 2850 }, { "epoch": 0.32225237449118044, "eval_loss": 2.182772397994995, "eval_runtime": 48.1793, "eval_samples_per_second": 10.378, "eval_steps_per_second": 10.378, "step": 2850 }, { "epoch": 0.32338308457711445, "grad_norm": 6.536876678466797, "learning_rate": 5.015951009982081e-06, "loss": 2.1132, "step": 2860 }, { "epoch": 0.3245137946630484, "grad_norm": 7.389856338500977, "learning_rate": 4.976694176268278e-06, "loss": 2.1565, "step": 2870 }, { "epoch": 0.3256445047489824, "grad_norm": 8.3035888671875, "learning_rate": 4.937489683169692e-06, "loss": 2.0618, "step": 2880 }, { "epoch": 0.32677521483491634, "grad_norm": 7.79971981048584, "learning_rate": 4.898339142229478e-06, "loss": 2.1093, "step": 2890 }, { "epoch": 0.3279059249208503, "grad_norm": 7.113759517669678, "learning_rate": 4.859244162773036e-06, "loss": 2.2038, "step": 2900 }, { "epoch": 0.32903663500678426, "grad_norm": 24.253713607788086, "learning_rate": 4.820206351841842e-06, "loss": 2.1438, "step": 2910 }, { "epoch": 0.3301673450927182, "grad_norm": 7.508760452270508, "learning_rate": 4.7812273141274054e-06, "loss": 2.1626, "step": 2920 }, { "epoch": 0.3312980551786522, "grad_norm": 7.20119047164917, "learning_rate": 4.7423086519052966e-06, "loss": 2.1174, "step": 2930 }, { "epoch": 0.33242876526458615, "grad_norm": 7.071509838104248, "learning_rate": 4.703451964969287e-06, "loss": 2.142, "step": 2940 }, { "epoch": 0.3335594753505201, "grad_norm": 7.641849517822266, "learning_rate": 4.66465885056558e-06, "loss": 2.1557, "step": 2950 }, { "epoch": 0.33469018543645407, "grad_norm": 6.416446208953857, "learning_rate": 4.625930903327171e-06, "loss": 2.0426, "step": 2960 }, { "epoch": 0.3358208955223881, "grad_norm": 8.850318908691406, "learning_rate": 4.587269715208281e-06, "loss": 2.1416, "step": 2970 }, { "epoch": 0.33695160560832205, "grad_norm": 8.38691234588623, "learning_rate": 4.548676875418931e-06, "loss": 2.1219, "step": 2980 }, { "epoch": 0.338082315694256, "grad_norm": 7.101838111877441, "learning_rate": 4.510153970359606e-06, "loss": 2.1287, "step": 2990 }, { "epoch": 0.33921302578018997, "grad_norm": 8.318099975585938, "learning_rate": 4.471702583556048e-06, "loss": 2.1025, "step": 3000 }, { "epoch": 0.33921302578018997, "eval_loss": 2.1799092292785645, "eval_runtime": 48.2965, "eval_samples_per_second": 10.353, "eval_steps_per_second": 10.353, "step": 3000 }, { "epoch": 0.34034373586612393, "grad_norm": 7.666057109832764, "learning_rate": 4.433324295594166e-06, "loss": 2.081, "step": 3010 }, { "epoch": 0.3414744459520579, "grad_norm": 11.191397666931152, "learning_rate": 4.395020684055059e-06, "loss": 2.2047, "step": 3020 }, { "epoch": 0.34260515603799185, "grad_norm": 7.815023899078369, "learning_rate": 4.356793323450175e-06, "loss": 2.1593, "step": 3030 }, { "epoch": 0.3437358661239258, "grad_norm": 8.812600135803223, "learning_rate": 4.3186437851565795e-06, "loss": 2.1693, "step": 3040 }, { "epoch": 0.3448665762098598, "grad_norm": 7.641300201416016, "learning_rate": 4.280573637352371e-06, "loss": 2.1696, "step": 3050 }, { "epoch": 0.34599728629579374, "grad_norm": 7.716446876525879, "learning_rate": 4.2425844449522155e-06, "loss": 2.1366, "step": 3060 }, { "epoch": 0.3471279963817277, "grad_norm": 6.964475631713867, "learning_rate": 4.204677769543019e-06, "loss": 2.109, "step": 3070 }, { "epoch": 0.3482587064676617, "grad_norm": 8.101829528808594, "learning_rate": 4.16685516931974e-06, "loss": 2.1024, "step": 3080 }, { "epoch": 0.3493894165535957, "grad_norm": 8.215981483459473, "learning_rate": 4.129118199021329e-06, "loss": 2.1565, "step": 3090 }, { "epoch": 0.35052012663952964, "grad_norm": 7.960821151733398, "learning_rate": 4.091468409866829e-06, "loss": 2.1078, "step": 3100 }, { "epoch": 0.3516508367254636, "grad_norm": 8.794036865234375, "learning_rate": 4.0539073494916075e-06, "loss": 2.1216, "step": 3110 }, { "epoch": 0.35278154681139756, "grad_norm": 7.812005996704102, "learning_rate": 4.016436561883746e-06, "loss": 2.1001, "step": 3120 }, { "epoch": 0.3539122568973315, "grad_norm": 7.4133453369140625, "learning_rate": 3.979057587320554e-06, "loss": 2.2065, "step": 3130 }, { "epoch": 0.3550429669832655, "grad_norm": 8.322656631469727, "learning_rate": 3.941771962305274e-06, "loss": 2.107, "step": 3140 }, { "epoch": 0.35617367706919945, "grad_norm": 7.369226455688477, "learning_rate": 3.904581219503912e-06, "loss": 2.0532, "step": 3150 }, { "epoch": 0.35617367706919945, "eval_loss": 2.17913556098938, "eval_runtime": 48.4147, "eval_samples_per_second": 10.327, "eval_steps_per_second": 10.327, "step": 3150 }, { "epoch": 0.3573043871551334, "grad_norm": 6.783062934875488, "learning_rate": 3.8674868876822395e-06, "loss": 2.104, "step": 3160 }, { "epoch": 0.35843509724106737, "grad_norm": 7.376591205596924, "learning_rate": 3.83049049164295e-06, "loss": 2.1581, "step": 3170 }, { "epoch": 0.35956580732700133, "grad_norm": 7.2051472663879395, "learning_rate": 3.793593552162978e-06, "loss": 2.1516, "step": 3180 }, { "epoch": 0.36069651741293535, "grad_norm": 7.003240585327148, "learning_rate": 3.75679758593099e-06, "loss": 2.217, "step": 3190 }, { "epoch": 0.3618272274988693, "grad_norm": 7.026625156402588, "learning_rate": 3.7201041054850387e-06, "loss": 2.1497, "step": 3200 }, { "epoch": 0.36295793758480327, "grad_norm": 7.71658992767334, "learning_rate": 3.6835146191503883e-06, "loss": 2.1373, "step": 3210 }, { "epoch": 0.36408864767073723, "grad_norm": 7.247103691101074, "learning_rate": 3.6470306309775077e-06, "loss": 2.0963, "step": 3220 }, { "epoch": 0.3652193577566712, "grad_norm": 6.893926620483398, "learning_rate": 3.6106536406802526e-06, "loss": 2.2709, "step": 3230 }, { "epoch": 0.36635006784260515, "grad_norm": 6.418797969818115, "learning_rate": 3.5743851435742172e-06, "loss": 2.1397, "step": 3240 }, { "epoch": 0.3674807779285391, "grad_norm": 8.434163093566895, "learning_rate": 3.538226630515262e-06, "loss": 2.1777, "step": 3250 }, { "epoch": 0.3686114880144731, "grad_norm": 7.856057643890381, "learning_rate": 3.5021795878382376e-06, "loss": 2.0881, "step": 3260 }, { "epoch": 0.36974219810040704, "grad_norm": 7.727081775665283, "learning_rate": 3.46624549729588e-06, "loss": 2.2259, "step": 3270 }, { "epoch": 0.370872908186341, "grad_norm": 8.211278915405273, "learning_rate": 3.430425835997908e-06, "loss": 2.1668, "step": 3280 }, { "epoch": 0.372003618272275, "grad_norm": 7.657146453857422, "learning_rate": 3.394722076350302e-06, "loss": 2.1582, "step": 3290 }, { "epoch": 0.373134328358209, "grad_norm": 6.054737091064453, "learning_rate": 3.3591356859947812e-06, "loss": 2.2534, "step": 3300 }, { "epoch": 0.373134328358209, "eval_loss": 2.1768276691436768, "eval_runtime": 48.4087, "eval_samples_per_second": 10.329, "eval_steps_per_second": 10.329, "step": 3300 }, { "epoch": 0.37426503844414294, "grad_norm": 6.833901405334473, "learning_rate": 3.323668127748465e-06, "loss": 2.1089, "step": 3310 }, { "epoch": 0.3753957485300769, "grad_norm": 8.001849174499512, "learning_rate": 3.2883208595437586e-06, "loss": 2.1773, "step": 3320 }, { "epoch": 0.37652645861601086, "grad_norm": 7.723094940185547, "learning_rate": 3.253095334368414e-06, "loss": 2.1558, "step": 3330 }, { "epoch": 0.3776571687019448, "grad_norm": 7.222660064697266, "learning_rate": 3.217993000205799e-06, "loss": 2.079, "step": 3340 }, { "epoch": 0.3787878787878788, "grad_norm": 7.211380958557129, "learning_rate": 3.1830152999753906e-06, "loss": 2.117, "step": 3350 }, { "epoch": 0.37991858887381275, "grad_norm": 7.046966075897217, "learning_rate": 3.148163671473439e-06, "loss": 2.122, "step": 3360 }, { "epoch": 0.3810492989597467, "grad_norm": 9.228036880493164, "learning_rate": 3.113439547313892e-06, "loss": 2.1295, "step": 3370 }, { "epoch": 0.38218000904568067, "grad_norm": 6.91172981262207, "learning_rate": 3.0788443548694876e-06, "loss": 2.2618, "step": 3380 }, { "epoch": 0.38331071913161463, "grad_norm": 6.812857627868652, "learning_rate": 3.0443795162130875e-06, "loss": 2.1437, "step": 3390 }, { "epoch": 0.38444142921754865, "grad_norm": 8.501266479492188, "learning_rate": 3.0100464480592185e-06, "loss": 2.1473, "step": 3400 }, { "epoch": 0.3855721393034826, "grad_norm": 7.0855817794799805, "learning_rate": 2.9758465617058404e-06, "loss": 2.1431, "step": 3410 }, { "epoch": 0.38670284938941657, "grad_norm": 8.621904373168945, "learning_rate": 2.9417812629763287e-06, "loss": 2.2068, "step": 3420 }, { "epoch": 0.38783355947535053, "grad_norm": 10.98462963104248, "learning_rate": 2.9078519521616896e-06, "loss": 2.2049, "step": 3430 }, { "epoch": 0.3889642695612845, "grad_norm": 7.825772285461426, "learning_rate": 2.8740600239630003e-06, "loss": 2.1155, "step": 3440 }, { "epoch": 0.39009497964721845, "grad_norm": 7.3312907218933105, "learning_rate": 2.8404068674340713e-06, "loss": 2.0736, "step": 3450 }, { "epoch": 0.39009497964721845, "eval_loss": 2.174567937850952, "eval_runtime": 48.9147, "eval_samples_per_second": 10.222, "eval_steps_per_second": 10.222, "step": 3450 }, { "epoch": 0.3912256897331524, "grad_norm": 8.426414489746094, "learning_rate": 2.80689386592436e-06, "loss": 2.1906, "step": 3460 }, { "epoch": 0.3923563998190864, "grad_norm": 7.72976016998291, "learning_rate": 2.7735223970220957e-06, "loss": 2.0633, "step": 3470 }, { "epoch": 0.39348710990502034, "grad_norm": 6.992247104644775, "learning_rate": 2.7402938324976576e-06, "loss": 2.2568, "step": 3480 }, { "epoch": 0.3946178199909543, "grad_norm": 7.082317352294922, "learning_rate": 2.70720953824719e-06, "loss": 2.1635, "step": 3490 }, { "epoch": 0.39574853007688826, "grad_norm": 7.662208557128906, "learning_rate": 2.674270874236441e-06, "loss": 2.1348, "step": 3500 }, { "epoch": 0.3968792401628223, "grad_norm": 7.307588577270508, "learning_rate": 2.6414791944448797e-06, "loss": 2.0784, "step": 3510 }, { "epoch": 0.39800995024875624, "grad_norm": 6.933011054992676, "learning_rate": 2.6088358468100245e-06, "loss": 2.1559, "step": 3520 }, { "epoch": 0.3991406603346902, "grad_norm": 8.685033798217773, "learning_rate": 2.5763421731720436e-06, "loss": 2.1613, "step": 3530 }, { "epoch": 0.40027137042062416, "grad_norm": 9.7869234085083, "learning_rate": 2.543999509218589e-06, "loss": 2.25, "step": 3540 }, { "epoch": 0.4014020805065581, "grad_norm": 6.805147647857666, "learning_rate": 2.5118091844299002e-06, "loss": 2.2006, "step": 3550 }, { "epoch": 0.4025327905924921, "grad_norm": 7.0284881591796875, "learning_rate": 2.4797725220241472e-06, "loss": 2.2249, "step": 3560 }, { "epoch": 0.40366350067842605, "grad_norm": 7.539117336273193, "learning_rate": 2.447890838903043e-06, "loss": 2.122, "step": 3570 }, { "epoch": 0.40479421076436, "grad_norm": 7.207503795623779, "learning_rate": 2.4161654455977102e-06, "loss": 2.105, "step": 3580 }, { "epoch": 0.40592492085029397, "grad_norm": 7.948032855987549, "learning_rate": 2.3845976462148035e-06, "loss": 2.0753, "step": 3590 }, { "epoch": 0.40705563093622793, "grad_norm": 9.196198463439941, "learning_rate": 2.353188738382916e-06, "loss": 2.1305, "step": 3600 }, { "epoch": 0.40705563093622793, "eval_loss": 2.1736714839935303, "eval_runtime": 48.4292, "eval_samples_per_second": 10.324, "eval_steps_per_second": 10.324, "step": 3600 }, { "epoch": 0.4081863410221619, "grad_norm": 6.023496627807617, "learning_rate": 2.321940013199227e-06, "loss": 2.0711, "step": 3610 }, { "epoch": 0.4093170511080959, "grad_norm": 7.560184478759766, "learning_rate": 2.2908527551764406e-06, "loss": 2.1535, "step": 3620 }, { "epoch": 0.41044776119402987, "grad_norm": 7.802646636962891, "learning_rate": 2.259928242189966e-06, "loss": 2.0695, "step": 3630 }, { "epoch": 0.41157847127996383, "grad_norm": 7.263815402984619, "learning_rate": 2.229167745425414e-06, "loss": 2.1159, "step": 3640 }, { "epoch": 0.4127091813658978, "grad_norm": 6.710958957672119, "learning_rate": 2.198572529326324e-06, "loss": 2.186, "step": 3650 }, { "epoch": 0.41383989145183175, "grad_norm": 8.483007431030273, "learning_rate": 2.1681438515421955e-06, "loss": 2.1148, "step": 3660 }, { "epoch": 0.4149706015377657, "grad_norm": 7.1506667137146, "learning_rate": 2.1378829628767965e-06, "loss": 2.1484, "step": 3670 }, { "epoch": 0.4161013116236997, "grad_norm": 7.395163536071777, "learning_rate": 2.1077911072367317e-06, "loss": 2.0811, "step": 3680 }, { "epoch": 0.41723202170963364, "grad_norm": 8.00395393371582, "learning_rate": 2.077869521580325e-06, "loss": 2.1155, "step": 3690 }, { "epoch": 0.4183627317955676, "grad_norm": 13.800628662109375, "learning_rate": 2.0481194358667695e-06, "loss": 2.2311, "step": 3700 }, { "epoch": 0.41949344188150156, "grad_norm": 7.042491436004639, "learning_rate": 2.0185420730055674e-06, "loss": 2.264, "step": 3710 }, { "epoch": 0.4206241519674355, "grad_norm": 8.76215934753418, "learning_rate": 1.989138648806254e-06, "loss": 2.0974, "step": 3720 }, { "epoch": 0.42175486205336954, "grad_norm": 7.132750511169434, "learning_rate": 1.9599103719284363e-06, "loss": 2.0756, "step": 3730 }, { "epoch": 0.4228855721393035, "grad_norm": 7.339520454406738, "learning_rate": 1.930858443832096e-06, "loss": 2.1834, "step": 3740 }, { "epoch": 0.42401628222523746, "grad_norm": 7.147751808166504, "learning_rate": 1.9019840587282103e-06, "loss": 2.1386, "step": 3750 }, { "epoch": 0.42401628222523746, "eval_loss": 2.1732726097106934, "eval_runtime": 48.3078, "eval_samples_per_second": 10.35, "eval_steps_per_second": 10.35, "step": 3750 }, { "epoch": 0.4251469923111714, "grad_norm": 8.438526153564453, "learning_rate": 1.8732884035296585e-06, "loss": 2.1042, "step": 3760 }, { "epoch": 0.4262777023971054, "grad_norm": 7.596499443054199, "learning_rate": 1.844772657802428e-06, "loss": 2.1709, "step": 3770 }, { "epoch": 0.42740841248303935, "grad_norm": 6.644456386566162, "learning_rate": 1.8164379937171385e-06, "loss": 2.0864, "step": 3780 }, { "epoch": 0.4285391225689733, "grad_norm": 7.394039630889893, "learning_rate": 1.7882855760008545e-06, "loss": 2.1278, "step": 3790 }, { "epoch": 0.42966983265490727, "grad_norm": 7.848877429962158, "learning_rate": 1.7603165618892033e-06, "loss": 2.1047, "step": 3800 }, { "epoch": 0.43080054274084123, "grad_norm": 7.905978679656982, "learning_rate": 1.7325321010788035e-06, "loss": 2.1174, "step": 3810 }, { "epoch": 0.4319312528267752, "grad_norm": 7.8521294593811035, "learning_rate": 1.7049333356800166e-06, "loss": 2.1071, "step": 3820 }, { "epoch": 0.4330619629127092, "grad_norm": 8.143991470336914, "learning_rate": 1.6775214001699916e-06, "loss": 2.0766, "step": 3830 }, { "epoch": 0.43419267299864317, "grad_norm": 8.954610824584961, "learning_rate": 1.6502974213460316e-06, "loss": 2.106, "step": 3840 }, { "epoch": 0.43532338308457713, "grad_norm": 6.5821003913879395, "learning_rate": 1.623262518279279e-06, "loss": 2.1213, "step": 3850 }, { "epoch": 0.4364540931705111, "grad_norm": 6.679924964904785, "learning_rate": 1.596417802268707e-06, "loss": 2.1796, "step": 3860 }, { "epoch": 0.43758480325644505, "grad_norm": 6.923624515533447, "learning_rate": 1.569764376795449e-06, "loss": 2.0936, "step": 3870 }, { "epoch": 0.438715513342379, "grad_norm": 8.030959129333496, "learning_rate": 1.5433033374774323e-06, "loss": 2.1039, "step": 3880 }, { "epoch": 0.439846223428313, "grad_norm": 8.67519760131836, "learning_rate": 1.517035772024343e-06, "loss": 2.0985, "step": 3890 }, { "epoch": 0.44097693351424694, "grad_norm": 6.698771953582764, "learning_rate": 1.4909627601929099e-06, "loss": 2.0911, "step": 3900 }, { "epoch": 0.44097693351424694, "eval_loss": 2.173064947128296, "eval_runtime": 48.4159, "eval_samples_per_second": 10.327, "eval_steps_per_second": 10.327, "step": 3900 }, { "epoch": 0.4421076436001809, "grad_norm": 6.461952209472656, "learning_rate": 1.4650853737425327e-06, "loss": 2.1133, "step": 3910 }, { "epoch": 0.44323835368611486, "grad_norm": 7.058958530426025, "learning_rate": 1.4394046763912123e-06, "loss": 2.1262, "step": 3920 }, { "epoch": 0.4443690637720488, "grad_norm": 6.935202121734619, "learning_rate": 1.4139217237718321e-06, "loss": 2.1551, "step": 3930 }, { "epoch": 0.44549977385798284, "grad_norm": 7.341289043426514, "learning_rate": 1.3886375633887664e-06, "loss": 2.0708, "step": 3940 }, { "epoch": 0.4466304839439168, "grad_norm": 7.592216491699219, "learning_rate": 1.3635532345748139e-06, "loss": 2.1538, "step": 3950 }, { "epoch": 0.44776119402985076, "grad_norm": 7.835076332092285, "learning_rate": 1.3386697684484855e-06, "loss": 2.1539, "step": 3960 }, { "epoch": 0.4488919041157847, "grad_norm": 7.151694297790527, "learning_rate": 1.3139881878716107e-06, "loss": 2.1251, "step": 3970 }, { "epoch": 0.4500226142017187, "grad_norm": 8.185730934143066, "learning_rate": 1.2895095074072987e-06, "loss": 2.1782, "step": 3980 }, { "epoch": 0.45115332428765265, "grad_norm": 7.637902736663818, "learning_rate": 1.2652347332782227e-06, "loss": 2.1704, "step": 3990 }, { "epoch": 0.4522840343735866, "grad_norm": 6.536497116088867, "learning_rate": 1.2411648633252719e-06, "loss": 2.0732, "step": 4000 }, { "epoch": 0.45341474445952057, "grad_norm": 8.338398933410645, "learning_rate": 1.2173008869665243e-06, "loss": 2.0794, "step": 4010 }, { "epoch": 0.45454545454545453, "grad_norm": 12.014728546142578, "learning_rate": 1.1936437851565791e-06, "loss": 2.1328, "step": 4020 }, { "epoch": 0.4556761646313885, "grad_norm": 7.422269821166992, "learning_rate": 1.1701945303462337e-06, "loss": 2.0427, "step": 4030 }, { "epoch": 0.45680687471732245, "grad_norm": 8.303122520446777, "learning_rate": 1.146954086442508e-06, "loss": 2.1383, "step": 4040 }, { "epoch": 0.45793758480325647, "grad_norm": 8.263761520385742, "learning_rate": 1.1239234087690252e-06, "loss": 2.172, "step": 4050 }, { "epoch": 0.45793758480325647, "eval_loss": 2.1722817420959473, "eval_runtime": 48.3495, "eval_samples_per_second": 10.341, "eval_steps_per_second": 10.341, "step": 4050 }, { "epoch": 0.45906829488919043, "grad_norm": 7.413692951202393, "learning_rate": 1.1011034440267395e-06, "loss": 2.1228, "step": 4060 }, { "epoch": 0.4601990049751244, "grad_norm": 7.110252380371094, "learning_rate": 1.078495130255023e-06, "loss": 2.0935, "step": 4070 }, { "epoch": 0.46132971506105835, "grad_norm": 8.894190788269043, "learning_rate": 1.0560993967931005e-06, "loss": 2.1834, "step": 4080 }, { "epoch": 0.4624604251469923, "grad_norm": 8.279912948608398, "learning_rate": 1.0339171642418586e-06, "loss": 2.0771, "step": 4090 }, { "epoch": 0.4635911352329263, "grad_norm": 8.827073097229004, "learning_rate": 1.0119493444259962e-06, "loss": 2.0894, "step": 4100 }, { "epoch": 0.46472184531886024, "grad_norm": 7.0823869705200195, "learning_rate": 9.901968403565429e-07, "loss": 2.1216, "step": 4110 }, { "epoch": 0.4658525554047942, "grad_norm": 10.01314926147461, "learning_rate": 9.686605461937441e-07, "loss": 2.195, "step": 4120 }, { "epoch": 0.46698326549072816, "grad_norm": 6.658485412597656, "learning_rate": 9.473413472102982e-07, "loss": 2.1583, "step": 4130 }, { "epoch": 0.4681139755766621, "grad_norm": 9.450687408447266, "learning_rate": 9.262401197549744e-07, "loss": 2.0773, "step": 4140 }, { "epoch": 0.4692446856625961, "grad_norm": 7.385706901550293, "learning_rate": 9.05357731216587e-07, "loss": 2.2287, "step": 4150 }, { "epoch": 0.4703753957485301, "grad_norm": 8.266607284545898, "learning_rate": 8.846950399883369e-07, "loss": 2.1595, "step": 4160 }, { "epoch": 0.47150610583446406, "grad_norm": 7.939385414123535, "learning_rate": 8.642528954325311e-07, "loss": 2.2658, "step": 4170 }, { "epoch": 0.472636815920398, "grad_norm": 9.474833488464355, "learning_rate": 8.440321378456656e-07, "loss": 2.1328, "step": 4180 }, { "epoch": 0.473767526006332, "grad_norm": 6.553383827209473, "learning_rate": 8.240335984238845e-07, "loss": 2.1364, "step": 4190 }, { "epoch": 0.47489823609226595, "grad_norm": 7.559937953948975, "learning_rate": 8.042580992288162e-07, "loss": 2.1844, "step": 4200 }, { "epoch": 0.47489823609226595, "eval_loss": 2.1722524166107178, "eval_runtime": 48.3983, "eval_samples_per_second": 10.331, "eval_steps_per_second": 10.331, "step": 4200 }, { "epoch": 0.4760289461781999, "grad_norm": 7.009474754333496, "learning_rate": 7.847064531537773e-07, "loss": 2.1265, "step": 4210 }, { "epoch": 0.47715965626413387, "grad_norm": 8.13442325592041, "learning_rate": 7.653794638903575e-07, "loss": 2.1094, "step": 4220 }, { "epoch": 0.47829036635006783, "grad_norm": 7.741903781890869, "learning_rate": 7.462779258953875e-07, "loss": 2.117, "step": 4230 }, { "epoch": 0.4794210764360018, "grad_norm": 8.127983093261719, "learning_rate": 7.274026243582796e-07, "loss": 2.169, "step": 4240 }, { "epoch": 0.48055178652193575, "grad_norm": 6.6429762840271, "learning_rate": 7.087543351687493e-07, "loss": 2.0987, "step": 4250 }, { "epoch": 0.4816824966078697, "grad_norm": 6.923095226287842, "learning_rate": 6.903338248849269e-07, "loss": 2.2313, "step": 4260 }, { "epoch": 0.48281320669380373, "grad_norm": 7.46994161605835, "learning_rate": 6.721418507018393e-07, "loss": 2.0535, "step": 4270 }, { "epoch": 0.4839439167797377, "grad_norm": 7.804521560668945, "learning_rate": 6.541791604202936e-07, "loss": 2.055, "step": 4280 }, { "epoch": 0.48507462686567165, "grad_norm": 8.184720039367676, "learning_rate": 6.36446492416131e-07, "loss": 2.1931, "step": 4290 }, { "epoch": 0.4862053369516056, "grad_norm": 7.078069686889648, "learning_rate": 6.18944575609881e-07, "loss": 2.0873, "step": 4300 }, { "epoch": 0.4873360470375396, "grad_norm": 9.47761058807373, "learning_rate": 6.016741294367911e-07, "loss": 2.1873, "step": 4310 }, { "epoch": 0.48846675712347354, "grad_norm": 7.138107776641846, "learning_rate": 5.846358638172615e-07, "loss": 2.1189, "step": 4320 }, { "epoch": 0.4895974672094075, "grad_norm": 6.7536540031433105, "learning_rate": 5.678304791276568e-07, "loss": 2.1254, "step": 4330 }, { "epoch": 0.49072817729534146, "grad_norm": 8.156876564025879, "learning_rate": 5.51258666171519e-07, "loss": 2.1607, "step": 4340 }, { "epoch": 0.4918588873812754, "grad_norm": 7.141668796539307, "learning_rate": 5.349211061511726e-07, "loss": 2.0986, "step": 4350 }, { "epoch": 0.4918588873812754, "eval_loss": 2.171393632888794, "eval_runtime": 48.45, "eval_samples_per_second": 10.32, "eval_steps_per_second": 10.32, "step": 4350 }, { "epoch": 0.4929895974672094, "grad_norm": 6.334621906280518, "learning_rate": 5.188184706397182e-07, "loss": 2.1247, "step": 4360 }, { "epoch": 0.4941203075531434, "grad_norm": 6.946900367736816, "learning_rate": 5.029514215534339e-07, "loss": 2.1163, "step": 4370 }, { "epoch": 0.49525101763907736, "grad_norm": 9.918743133544922, "learning_rate": 4.873206111245595e-07, "loss": 2.0662, "step": 4380 }, { "epoch": 0.4963817277250113, "grad_norm": 7.090532302856445, "learning_rate": 4.719266818744912e-07, "loss": 2.1574, "step": 4390 }, { "epoch": 0.4975124378109453, "grad_norm": 10.679587364196777, "learning_rate": 4.5677026658736477e-07, "loss": 2.2056, "step": 4400 }, { "epoch": 0.49864314789687925, "grad_norm": 7.583798408508301, "learning_rate": 4.418519882840505e-07, "loss": 2.1393, "step": 4410 }, { "epoch": 0.4997738579828132, "grad_norm": 8.197004318237305, "learning_rate": 4.271724601965371e-07, "loss": 2.1061, "step": 4420 }, { "epoch": 0.5009045680687472, "grad_norm": 8.92779541015625, "learning_rate": 4.127322857427306e-07, "loss": 2.0291, "step": 4430 }, { "epoch": 0.5020352781546812, "grad_norm": 6.497152805328369, "learning_rate": 3.985320585016425e-07, "loss": 2.1127, "step": 4440 }, { "epoch": 0.5031659882406151, "grad_norm": 6.622493267059326, "learning_rate": 3.8457236218899724e-07, "loss": 2.1018, "step": 4450 }, { "epoch": 0.5042966983265491, "grad_norm": 6.964017391204834, "learning_rate": 3.708537706332345e-07, "loss": 2.1857, "step": 4460 }, { "epoch": 0.505427408412483, "grad_norm": 7.607455730438232, "learning_rate": 3.573768477519189e-07, "loss": 2.1044, "step": 4470 }, { "epoch": 0.506558118498417, "grad_norm": 8.895915985107422, "learning_rate": 3.441421475285679e-07, "loss": 2.143, "step": 4480 }, { "epoch": 0.5076888285843509, "grad_norm": 7.190673828125, "learning_rate": 3.311502139898677e-07, "loss": 2.1138, "step": 4490 }, { "epoch": 0.508819538670285, "grad_norm": 6.651699066162109, "learning_rate": 3.18401581183321e-07, "loss": 2.0651, "step": 4500 }, { "epoch": 0.508819538670285, "eval_loss": 2.1712958812713623, "eval_runtime": 48.552, "eval_samples_per_second": 10.298, "eval_steps_per_second": 10.298, "step": 4500 }, { "epoch": 0.5099502487562189, "grad_norm": 6.923861026763916, "learning_rate": 3.0589677315529047e-07, "loss": 2.1079, "step": 4510 }, { "epoch": 0.5110809588421529, "grad_norm": 7.113198280334473, "learning_rate": 2.9363630392945514e-07, "loss": 2.0686, "step": 4520 }, { "epoch": 0.5122116689280869, "grad_norm": 43.09786605834961, "learning_rate": 2.8162067748568543e-07, "loss": 2.144, "step": 4530 }, { "epoch": 0.5133423790140208, "grad_norm": 6.44378662109375, "learning_rate": 2.6985038773932045e-07, "loss": 2.0813, "step": 4540 }, { "epoch": 0.5144730890999548, "grad_norm": 8.069598197937012, "learning_rate": 2.583259185208714e-07, "loss": 2.0546, "step": 4550 }, { "epoch": 0.5156037991858887, "grad_norm": 7.838702201843262, "learning_rate": 2.4704774355612943e-07, "loss": 2.1704, "step": 4560 }, { "epoch": 0.5167345092718227, "grad_norm": 10.241755485534668, "learning_rate": 2.3601632644669538e-07, "loss": 2.1599, "step": 4570 }, { "epoch": 0.5178652193577566, "grad_norm": 7.890923976898193, "learning_rate": 2.2523212065091726e-07, "loss": 2.2198, "step": 4580 }, { "epoch": 0.5189959294436907, "grad_norm": 8.300923347473145, "learning_rate": 2.1469556946525707e-07, "loss": 2.1522, "step": 4590 }, { "epoch": 0.5201266395296246, "grad_norm": 7.938436508178711, "learning_rate": 2.0440710600606595e-07, "loss": 2.0943, "step": 4600 }, { "epoch": 0.5212573496155586, "grad_norm": 7.055637836456299, "learning_rate": 1.9436715319177957e-07, "loss": 2.1505, "step": 4610 }, { "epoch": 0.5223880597014925, "grad_norm": 7.342301845550537, "learning_rate": 1.8457612372553348e-07, "loss": 2.1136, "step": 4620 }, { "epoch": 0.5235187697874265, "grad_norm": 8.444767951965332, "learning_rate": 1.75034420078201e-07, "loss": 2.1256, "step": 4630 }, { "epoch": 0.5246494798733605, "grad_norm": 7.327487468719482, "learning_rate": 1.65742434471846e-07, "loss": 2.1688, "step": 4640 }, { "epoch": 0.5257801899592944, "grad_norm": 7.379541397094727, "learning_rate": 1.567005488636024e-07, "loss": 2.1732, "step": 4650 }, { "epoch": 0.5257801899592944, "eval_loss": 2.17098331451416, "eval_runtime": 48.54, "eval_samples_per_second": 10.301, "eval_steps_per_second": 10.301, "step": 4650 }, { "epoch": 0.5269109000452284, "grad_norm": 6.7260823249816895, "learning_rate": 1.4790913492997437e-07, "loss": 2.1182, "step": 4660 }, { "epoch": 0.5280416101311624, "grad_norm": 7.565569877624512, "learning_rate": 1.3936855405155408e-07, "loss": 2.1055, "step": 4670 }, { "epoch": 0.5291723202170964, "grad_norm": 9.824432373046875, "learning_rate": 1.3107915729816954e-07, "loss": 2.0709, "step": 4680 }, { "epoch": 0.5303030303030303, "grad_norm": 8.108494758605957, "learning_rate": 1.230412854144547e-07, "loss": 2.0708, "step": 4690 }, { "epoch": 0.5314337403889643, "grad_norm": 7.706394672393799, "learning_rate": 1.1525526880584101e-07, "loss": 2.2229, "step": 4700 }, { "epoch": 0.5325644504748982, "grad_norm": 6.23236083984375, "learning_rate": 1.0772142752497605e-07, "loss": 2.122, "step": 4710 }, { "epoch": 0.5336951605608322, "grad_norm": 9.338789939880371, "learning_rate": 1.0044007125856459e-07, "loss": 2.1791, "step": 4720 }, { "epoch": 0.5348258706467661, "grad_norm": 6.334522724151611, "learning_rate": 9.341149931464538e-08, "loss": 2.1446, "step": 4730 }, { "epoch": 0.5359565807327001, "grad_norm": 7.732450008392334, "learning_rate": 8.663600061028162e-08, "loss": 2.1937, "step": 4740 }, { "epoch": 0.5370872908186342, "grad_norm": 7.841606616973877, "learning_rate": 8.01138536596864e-08, "loss": 2.106, "step": 4750 }, { "epoch": 0.5382180009045681, "grad_norm": 6.506497859954834, "learning_rate": 7.384532656277699e-08, "loss": 2.1109, "step": 4760 }, { "epoch": 0.5393487109905021, "grad_norm": 8.013265609741211, "learning_rate": 6.783067699414891e-08, "loss": 2.1418, "step": 4770 }, { "epoch": 0.540479421076436, "grad_norm": 8.116345405578613, "learning_rate": 6.207015219248866e-08, "loss": 2.1595, "step": 4780 }, { "epoch": 0.54161013116237, "grad_norm": 6.582291603088379, "learning_rate": 5.656398895040812e-08, "loss": 2.2133, "step": 4790 }, { "epoch": 0.5427408412483039, "grad_norm": 9.832483291625977, "learning_rate": 5.131241360471217e-08, "loss": 2.0924, "step": 4800 }, { "epoch": 0.5427408412483039, "eval_loss": 2.170783519744873, "eval_runtime": 48.3173, "eval_samples_per_second": 10.348, "eval_steps_per_second": 10.348, "step": 4800 }, { "epoch": 0.5438715513342379, "grad_norm": 7.343964099884033, "learning_rate": 4.631564202709354e-08, "loss": 2.1663, "step": 4810 }, { "epoch": 0.5450022614201718, "grad_norm": 7.99005126953125, "learning_rate": 4.1573879615262184e-08, "loss": 2.1205, "step": 4820 }, { "epoch": 0.5461329715061058, "grad_norm": 9.38589859008789, "learning_rate": 3.708732128449785e-08, "loss": 2.1934, "step": 4830 }, { "epoch": 0.5472636815920398, "grad_norm": 8.230008125305176, "learning_rate": 3.2856151459641216e-08, "loss": 2.1707, "step": 4840 }, { "epoch": 0.5483943916779738, "grad_norm": 8.252881050109863, "learning_rate": 2.8880544067511063e-08, "loss": 2.0679, "step": 4850 }, { "epoch": 0.5495251017639078, "grad_norm": 7.558712005615234, "learning_rate": 2.5160662529755823e-08, "loss": 2.0296, "step": 4860 }, { "epoch": 0.5506558118498417, "grad_norm": 7.497928142547607, "learning_rate": 2.169665975613605e-08, "loss": 2.1634, "step": 4870 }, { "epoch": 0.5517865219357757, "grad_norm": 9.227611541748047, "learning_rate": 1.8488678138238458e-08, "loss": 2.154, "step": 4880 }, { "epoch": 0.5529172320217096, "grad_norm": 7.9704670906066895, "learning_rate": 1.5536849543621583e-08, "loss": 2.1631, "step": 4890 }, { "epoch": 0.5540479421076436, "grad_norm": 6.741693496704102, "learning_rate": 1.2841295310397906e-08, "loss": 2.0698, "step": 4900 }, { "epoch": 0.5551786521935775, "grad_norm": 6.446054458618164, "learning_rate": 1.0402126242244764e-08, "loss": 2.1016, "step": 4910 }, { "epoch": 0.5563093622795116, "grad_norm": 8.23855972290039, "learning_rate": 8.219442603847605e-09, "loss": 2.1704, "step": 4920 }, { "epoch": 0.5574400723654455, "grad_norm": 8.730588912963867, "learning_rate": 6.293334116783817e-09, "loss": 2.1139, "step": 4930 }, { "epoch": 0.5585707824513795, "grad_norm": 7.763499736785889, "learning_rate": 4.623879955827082e-09, "loss": 2.1412, "step": 4940 }, { "epoch": 0.5597014925373134, "grad_norm": 7.802066802978516, "learning_rate": 3.211148745700665e-09, "loss": 2.2264, "step": 4950 }, { "epoch": 0.5597014925373134, "eval_loss": 2.170982837677002, "eval_runtime": 48.4544, "eval_samples_per_second": 10.319, "eval_steps_per_second": 10.319, "step": 4950 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.36931017801728e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }