{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11786454262354065, "eval_steps": 200, "global_step": 19000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015508492450465875, "grad_norm": 0.12764382362365723, "learning_rate": 0.0015, "loss": 3.062, "step": 25 }, { "epoch": 0.0003101698490093175, "grad_norm": 0.08861421793699265, "learning_rate": 0.0015, "loss": 3.0523, "step": 50 }, { "epoch": 0.00046525477351397625, "grad_norm": 0.10059793293476105, "learning_rate": 0.0015, "loss": 3.0271, "step": 75 }, { "epoch": 0.000620339698018635, "grad_norm": 0.09730365872383118, "learning_rate": 0.0015, "loss": 3.0421, "step": 100 }, { "epoch": 0.0007754246225232938, "grad_norm": 0.15407200157642365, "learning_rate": 0.0015, "loss": 2.9894, "step": 125 }, { "epoch": 0.0009305095470279525, "grad_norm": 0.12250959873199463, "learning_rate": 0.0015, "loss": 3.0055, "step": 150 }, { "epoch": 0.0010855944715326112, "grad_norm": 0.08540652692317963, "learning_rate": 0.0015, "loss": 3.0025, "step": 175 }, { "epoch": 0.00124067939603727, "grad_norm": 0.1479829102754593, "learning_rate": 0.0015, "loss": 2.9881, "step": 200 }, { "epoch": 0.00124067939603727, "eval_loss": 4.852784156799316, "perplexity": 128.09652709960938, "step": 200 }, { "epoch": 0.0013957643205419288, "grad_norm": 0.1036139577627182, "learning_rate": 0.0015, "loss": 2.9609, "step": 225 }, { "epoch": 0.0015508492450465876, "grad_norm": 0.10382606089115143, "learning_rate": 0.0015, "loss": 2.9771, "step": 250 }, { "epoch": 0.0017059341695512462, "grad_norm": 0.08648105710744858, "learning_rate": 0.0015, "loss": 2.9522, "step": 275 }, { "epoch": 0.001861019094055905, "grad_norm": 0.08675844967365265, "learning_rate": 0.0015, "loss": 2.9833, "step": 300 }, { "epoch": 0.0020161040185605636, "grad_norm": 0.1417882740497589, "learning_rate": 0.0015, "loss": 2.9626, "step": 325 }, { "epoch": 0.0021711889430652224, "grad_norm": 0.09860406816005707, "learning_rate": 0.0015, "loss": 2.9515, "step": 350 }, { "epoch": 0.002326273867569881, "grad_norm": 0.11757214367389679, "learning_rate": 0.0015, "loss": 2.9523, "step": 375 }, { "epoch": 0.00248135879207454, "grad_norm": 0.11415340006351471, "learning_rate": 0.0015, "loss": 2.9579, "step": 400 }, { "epoch": 0.00248135879207454, "eval_loss": 4.8426313400268555, "perplexity": 126.80257415771484, "step": 400 }, { "epoch": 0.002636443716579199, "grad_norm": 0.10692940652370453, "learning_rate": 0.0015, "loss": 2.9273, "step": 425 }, { "epoch": 0.0027915286410838576, "grad_norm": 0.12780559062957764, "learning_rate": 0.0015, "loss": 2.9577, "step": 450 }, { "epoch": 0.0029466135655885164, "grad_norm": 0.21147418022155762, "learning_rate": 0.0015, "loss": 2.9118, "step": 475 }, { "epoch": 0.003101698490093175, "grad_norm": 0.13209331035614014, "learning_rate": 0.0015, "loss": 2.9584, "step": 500 }, { "epoch": 0.0032567834145978336, "grad_norm": 0.13230836391448975, "learning_rate": 0.0015, "loss": 2.9621, "step": 525 }, { "epoch": 0.0034118683391024924, "grad_norm": 0.11265246570110321, "learning_rate": 0.0015, "loss": 2.941, "step": 550 }, { "epoch": 0.003566953263607151, "grad_norm": 0.10484226047992706, "learning_rate": 0.0015, "loss": 2.9311, "step": 575 }, { "epoch": 0.00372203818811181, "grad_norm": 0.13941314816474915, "learning_rate": 0.0015, "loss": 2.9741, "step": 600 }, { "epoch": 0.00372203818811181, "eval_loss": 4.831629276275635, "perplexity": 125.41513061523438, "step": 600 }, { "epoch": 0.0038771231126164688, "grad_norm": 0.0885343998670578, "learning_rate": 0.0015, "loss": 2.944, "step": 625 }, { "epoch": 0.004032208037121127, "grad_norm": 0.093564473092556, "learning_rate": 0.0015, "loss": 2.9673, "step": 650 }, { "epoch": 0.004187292961625786, "grad_norm": 0.15350665152072906, "learning_rate": 0.0015, "loss": 2.9314, "step": 675 }, { "epoch": 0.004342377886130445, "grad_norm": 0.11337901651859283, "learning_rate": 0.0015, "loss": 2.97, "step": 700 }, { "epoch": 0.004497462810635104, "grad_norm": 0.13508272171020508, "learning_rate": 0.0015, "loss": 2.9121, "step": 725 }, { "epoch": 0.004652547735139762, "grad_norm": 0.10049441456794739, "learning_rate": 0.0015, "loss": 2.9572, "step": 750 }, { "epoch": 0.004807632659644422, "grad_norm": 0.1017594188451767, "learning_rate": 0.0015, "loss": 2.9207, "step": 775 }, { "epoch": 0.00496271758414908, "grad_norm": 0.09874167293310165, "learning_rate": 0.0015, "loss": 2.9258, "step": 800 }, { "epoch": 0.00496271758414908, "eval_loss": 4.783432960510254, "perplexity": 119.51393127441406, "step": 800 }, { "epoch": 0.005117802508653739, "grad_norm": 0.09769408404827118, "learning_rate": 0.0015, "loss": 2.9606, "step": 825 }, { "epoch": 0.005272887433158398, "grad_norm": 0.11946038156747818, "learning_rate": 0.0015, "loss": 2.889, "step": 850 }, { "epoch": 0.005427972357663056, "grad_norm": 0.12191672623157501, "learning_rate": 0.0015, "loss": 2.9094, "step": 875 }, { "epoch": 0.005583057282167715, "grad_norm": 0.09349209070205688, "learning_rate": 0.0015, "loss": 2.9242, "step": 900 }, { "epoch": 0.0057381422066723736, "grad_norm": 0.07793531566858292, "learning_rate": 0.0015, "loss": 2.9692, "step": 925 }, { "epoch": 0.005893227131177033, "grad_norm": 0.1276599019765854, "learning_rate": 0.0015, "loss": 2.9339, "step": 950 }, { "epoch": 0.006048312055681691, "grad_norm": 0.11083021759986877, "learning_rate": 0.0015, "loss": 2.9251, "step": 975 }, { "epoch": 0.00620339698018635, "grad_norm": 0.13207702338695526, "learning_rate": 0.0015, "loss": 2.8567, "step": 1000 }, { "epoch": 0.00620339698018635, "eval_loss": 4.790068626403809, "perplexity": 120.30962371826172, "step": 1000 }, { "epoch": 0.006358481904691009, "grad_norm": 0.20453479886054993, "learning_rate": 0.0015, "loss": 2.9127, "step": 1025 }, { "epoch": 0.006513566829195667, "grad_norm": 0.12530989944934845, "learning_rate": 0.0015, "loss": 2.9147, "step": 1050 }, { "epoch": 0.006668651753700326, "grad_norm": 0.11520997434854507, "learning_rate": 0.0015, "loss": 2.936, "step": 1075 }, { "epoch": 0.006823736678204985, "grad_norm": 0.09191219508647919, "learning_rate": 0.0015, "loss": 2.9115, "step": 1100 }, { "epoch": 0.006978821602709644, "grad_norm": 0.07251202315092087, "learning_rate": 0.0015, "loss": 2.9154, "step": 1125 }, { "epoch": 0.007133906527214302, "grad_norm": 0.10054546594619751, "learning_rate": 0.0015, "loss": 2.8924, "step": 1150 }, { "epoch": 0.007288991451718962, "grad_norm": 0.1192697063088417, "learning_rate": 0.0015, "loss": 2.957, "step": 1175 }, { "epoch": 0.00744407637622362, "grad_norm": 0.14840476214885712, "learning_rate": 0.0015, "loss": 2.895, "step": 1200 }, { "epoch": 0.00744407637622362, "eval_loss": 4.770949363708496, "perplexity": 118.03124237060547, "step": 1200 }, { "epoch": 0.007599161300728279, "grad_norm": 0.11221906542778015, "learning_rate": 0.0015, "loss": 2.9131, "step": 1225 }, { "epoch": 0.0077542462252329376, "grad_norm": 0.11528974026441574, "learning_rate": 0.0015, "loss": 2.8783, "step": 1250 }, { "epoch": 0.007909331149737596, "grad_norm": 0.0807015597820282, "learning_rate": 0.0015, "loss": 2.91, "step": 1275 }, { "epoch": 0.008064416074242254, "grad_norm": 0.1435490846633911, "learning_rate": 0.0015, "loss": 2.9198, "step": 1300 }, { "epoch": 0.008219500998746914, "grad_norm": 0.11956608295440674, "learning_rate": 0.0015, "loss": 2.8771, "step": 1325 }, { "epoch": 0.008374585923251573, "grad_norm": 0.10362117737531662, "learning_rate": 0.0015, "loss": 2.8913, "step": 1350 }, { "epoch": 0.008529670847756231, "grad_norm": 0.07132004201412201, "learning_rate": 0.0015, "loss": 2.946, "step": 1375 }, { "epoch": 0.00868475577226089, "grad_norm": 0.08756817877292633, "learning_rate": 0.0015, "loss": 2.9015, "step": 1400 }, { "epoch": 0.00868475577226089, "eval_loss": 4.769084453582764, "perplexity": 117.81133270263672, "step": 1400 }, { "epoch": 0.00883984069676555, "grad_norm": 0.18067917227745056, "learning_rate": 0.0015, "loss": 2.8887, "step": 1425 }, { "epoch": 0.008994925621270208, "grad_norm": 0.09742950648069382, "learning_rate": 0.0015, "loss": 2.8834, "step": 1450 }, { "epoch": 0.009150010545774866, "grad_norm": 0.09857803583145142, "learning_rate": 0.0015, "loss": 2.8856, "step": 1475 }, { "epoch": 0.009305095470279525, "grad_norm": 0.17605328559875488, "learning_rate": 0.0015, "loss": 2.9238, "step": 1500 }, { "epoch": 0.009460180394784183, "grad_norm": 0.08441105484962463, "learning_rate": 0.0015, "loss": 2.8605, "step": 1525 }, { "epoch": 0.009615265319288843, "grad_norm": 0.15339621901512146, "learning_rate": 0.0015, "loss": 2.9421, "step": 1550 }, { "epoch": 0.009770350243793502, "grad_norm": 0.21426236629486084, "learning_rate": 0.0015, "loss": 2.8899, "step": 1575 }, { "epoch": 0.00992543516829816, "grad_norm": 0.16503557562828064, "learning_rate": 0.0015, "loss": 2.878, "step": 1600 }, { "epoch": 0.00992543516829816, "eval_loss": 4.774999618530273, "perplexity": 118.51026916503906, "step": 1600 }, { "epoch": 0.010080520092802818, "grad_norm": 0.11398541182279587, "learning_rate": 0.0015, "loss": 2.866, "step": 1625 }, { "epoch": 0.010235605017307478, "grad_norm": 0.16510234773159027, "learning_rate": 0.0015, "loss": 2.8936, "step": 1650 }, { "epoch": 0.010390689941812137, "grad_norm": 0.08827799558639526, "learning_rate": 0.0015, "loss": 2.8789, "step": 1675 }, { "epoch": 0.010545774866316795, "grad_norm": 0.12703286111354828, "learning_rate": 0.0015, "loss": 2.9104, "step": 1700 }, { "epoch": 0.010700859790821454, "grad_norm": 0.10185768455266953, "learning_rate": 0.0015, "loss": 2.8389, "step": 1725 }, { "epoch": 0.010855944715326112, "grad_norm": 0.13076236844062805, "learning_rate": 0.0015, "loss": 2.8603, "step": 1750 }, { "epoch": 0.011011029639830772, "grad_norm": 0.08955707401037216, "learning_rate": 0.0015, "loss": 2.8283, "step": 1775 }, { "epoch": 0.01116611456433543, "grad_norm": 0.07163148373365402, "learning_rate": 0.0015, "loss": 2.8852, "step": 1800 }, { "epoch": 0.01116611456433543, "eval_loss": 4.75281286239624, "perplexity": 115.90986633300781, "step": 1800 }, { "epoch": 0.011321199488840089, "grad_norm": 0.09710580855607986, "learning_rate": 0.0015, "loss": 2.8573, "step": 1825 }, { "epoch": 0.011476284413344747, "grad_norm": 0.11669810861349106, "learning_rate": 0.0015, "loss": 2.8674, "step": 1850 }, { "epoch": 0.011631369337849405, "grad_norm": 0.11174403876066208, "learning_rate": 0.0015, "loss": 2.9121, "step": 1875 }, { "epoch": 0.011786454262354066, "grad_norm": 0.09547118842601776, "learning_rate": 0.0015, "loss": 2.9033, "step": 1900 }, { "epoch": 0.011941539186858724, "grad_norm": 0.09878171980381012, "learning_rate": 0.0015, "loss": 2.8738, "step": 1925 }, { "epoch": 0.012096624111363382, "grad_norm": 0.09479096531867981, "learning_rate": 0.0015, "loss": 2.8775, "step": 1950 }, { "epoch": 0.01225170903586804, "grad_norm": 0.12434259057044983, "learning_rate": 0.0015, "loss": 2.8452, "step": 1975 }, { "epoch": 0.0124067939603727, "grad_norm": 0.09166444838047028, "learning_rate": 0.0015, "loss": 2.8546, "step": 2000 }, { "epoch": 0.0124067939603727, "eval_loss": 4.748600482940674, "perplexity": 115.42263793945312, "step": 2000 }, { "epoch": 0.01256187888487736, "grad_norm": 0.07793508470058441, "learning_rate": 0.0015, "loss": 2.8306, "step": 2025 }, { "epoch": 0.012716963809382018, "grad_norm": 0.1670406609773636, "learning_rate": 0.0015, "loss": 2.863, "step": 2050 }, { "epoch": 0.012872048733886676, "grad_norm": 0.20754718780517578, "learning_rate": 0.0015, "loss": 2.8871, "step": 2075 }, { "epoch": 0.013027133658391334, "grad_norm": 0.14225496351718903, "learning_rate": 0.0015, "loss": 2.8498, "step": 2100 }, { "epoch": 0.013182218582895994, "grad_norm": 0.11809197813272476, "learning_rate": 0.0015, "loss": 2.8206, "step": 2125 }, { "epoch": 0.013337303507400653, "grad_norm": 0.09541622549295425, "learning_rate": 0.0015, "loss": 2.8585, "step": 2150 }, { "epoch": 0.013492388431905311, "grad_norm": 0.1115843802690506, "learning_rate": 0.0015, "loss": 2.8533, "step": 2175 }, { "epoch": 0.01364747335640997, "grad_norm": 0.08517899364233017, "learning_rate": 0.0015, "loss": 2.8477, "step": 2200 }, { "epoch": 0.01364747335640997, "eval_loss": 4.753279685974121, "perplexity": 115.9639892578125, "step": 2200 }, { "epoch": 0.01380255828091463, "grad_norm": 0.13083544373512268, "learning_rate": 0.0015, "loss": 2.8518, "step": 2225 }, { "epoch": 0.013957643205419288, "grad_norm": 0.07403870671987534, "learning_rate": 0.0015, "loss": 2.8685, "step": 2250 }, { "epoch": 0.014112728129923946, "grad_norm": 0.16436311602592468, "learning_rate": 0.0015, "loss": 2.8601, "step": 2275 }, { "epoch": 0.014267813054428605, "grad_norm": 0.12990187108516693, "learning_rate": 0.0015, "loss": 2.8332, "step": 2300 }, { "epoch": 0.014422897978933263, "grad_norm": 0.0897112786769867, "learning_rate": 0.0015, "loss": 2.8578, "step": 2325 }, { "epoch": 0.014577982903437923, "grad_norm": 0.10096879303455353, "learning_rate": 0.0015, "loss": 2.802, "step": 2350 }, { "epoch": 0.014733067827942582, "grad_norm": 0.0850217416882515, "learning_rate": 0.0015, "loss": 2.8529, "step": 2375 }, { "epoch": 0.01488815275244724, "grad_norm": 0.11395123600959778, "learning_rate": 0.0015, "loss": 2.8655, "step": 2400 }, { "epoch": 0.01488815275244724, "eval_loss": 4.743602275848389, "perplexity": 114.84716796875, "step": 2400 }, { "epoch": 0.015043237676951898, "grad_norm": 0.1590801179409027, "learning_rate": 0.0015, "loss": 2.8227, "step": 2425 }, { "epoch": 0.015198322601456558, "grad_norm": 0.16819922626018524, "learning_rate": 0.0015, "loss": 2.8551, "step": 2450 }, { "epoch": 0.015353407525961217, "grad_norm": 0.15390118956565857, "learning_rate": 0.0015, "loss": 2.8691, "step": 2475 }, { "epoch": 0.015508492450465875, "grad_norm": 0.10976951569318771, "learning_rate": 0.0015, "loss": 2.8615, "step": 2500 }, { "epoch": 0.015663577374970535, "grad_norm": 0.09539350867271423, "learning_rate": 0.0015, "loss": 2.7755, "step": 2525 }, { "epoch": 0.015818662299475192, "grad_norm": 0.09798863530158997, "learning_rate": 0.0015, "loss": 2.7675, "step": 2550 }, { "epoch": 0.015973747223979852, "grad_norm": 0.10233014822006226, "learning_rate": 0.0015, "loss": 2.7905, "step": 2575 }, { "epoch": 0.01612883214848451, "grad_norm": 0.09607812017202377, "learning_rate": 0.0015, "loss": 2.779, "step": 2600 }, { "epoch": 0.01612883214848451, "eval_loss": 4.757762432098389, "perplexity": 116.48499298095703, "step": 2600 }, { "epoch": 0.01628391707298917, "grad_norm": 0.09782920032739639, "learning_rate": 0.0015, "loss": 2.8455, "step": 2625 }, { "epoch": 0.01643900199749383, "grad_norm": 0.08443335443735123, "learning_rate": 0.0015, "loss": 2.8537, "step": 2650 }, { "epoch": 0.016594086921998485, "grad_norm": 0.1567981094121933, "learning_rate": 0.0015, "loss": 2.8334, "step": 2675 }, { "epoch": 0.016749171846503146, "grad_norm": 0.1279255449771881, "learning_rate": 0.0015, "loss": 2.8733, "step": 2700 }, { "epoch": 0.016904256771007802, "grad_norm": 0.09086953848600388, "learning_rate": 0.0015, "loss": 2.7992, "step": 2725 }, { "epoch": 0.017059341695512462, "grad_norm": 0.15084481239318848, "learning_rate": 0.0015, "loss": 2.7891, "step": 2750 }, { "epoch": 0.017214426620017122, "grad_norm": 0.1059018149971962, "learning_rate": 0.0015, "loss": 2.8088, "step": 2775 }, { "epoch": 0.01736951154452178, "grad_norm": 0.08803548663854599, "learning_rate": 0.0015, "loss": 2.817, "step": 2800 }, { "epoch": 0.01736951154452178, "eval_loss": 4.730724334716797, "perplexity": 113.37765502929688, "step": 2800 }, { "epoch": 0.01752459646902644, "grad_norm": 0.0954984724521637, "learning_rate": 0.0015, "loss": 2.8528, "step": 2825 }, { "epoch": 0.0176796813935311, "grad_norm": 0.14015914499759674, "learning_rate": 0.0015, "loss": 2.8131, "step": 2850 }, { "epoch": 0.017834766318035756, "grad_norm": 0.07908599078655243, "learning_rate": 0.0015, "loss": 2.8371, "step": 2875 }, { "epoch": 0.017989851242540416, "grad_norm": 0.14578266441822052, "learning_rate": 0.0015, "loss": 2.8033, "step": 2900 }, { "epoch": 0.018144936167045073, "grad_norm": 0.10059946030378342, "learning_rate": 0.0015, "loss": 2.8165, "step": 2925 }, { "epoch": 0.018300021091549733, "grad_norm": 0.10238490998744965, "learning_rate": 0.0015, "loss": 2.7739, "step": 2950 }, { "epoch": 0.018455106016054393, "grad_norm": 0.12706336379051208, "learning_rate": 0.0015, "loss": 2.8018, "step": 2975 }, { "epoch": 0.01861019094055905, "grad_norm": 0.1252700239419937, "learning_rate": 0.0015, "loss": 2.8155, "step": 3000 }, { "epoch": 0.01861019094055905, "eval_loss": 4.707705020904541, "perplexity": 110.79759216308594, "step": 3000 }, { "epoch": 0.01876527586506371, "grad_norm": 0.13322588801383972, "learning_rate": 0.0015, "loss": 2.8201, "step": 3025 }, { "epoch": 0.018920360789568366, "grad_norm": 0.14152252674102783, "learning_rate": 0.0015, "loss": 2.7942, "step": 3050 }, { "epoch": 0.019075445714073026, "grad_norm": 0.1276037096977234, "learning_rate": 0.0015, "loss": 2.8065, "step": 3075 }, { "epoch": 0.019230530638577686, "grad_norm": 0.11600831896066666, "learning_rate": 0.0015, "loss": 2.8335, "step": 3100 }, { "epoch": 0.019385615563082343, "grad_norm": 0.11985427141189575, "learning_rate": 0.0015, "loss": 2.7993, "step": 3125 }, { "epoch": 0.019540700487587003, "grad_norm": 0.11630894988775253, "learning_rate": 0.0015, "loss": 2.7838, "step": 3150 }, { "epoch": 0.01969578541209166, "grad_norm": 0.08493560552597046, "learning_rate": 0.0015, "loss": 2.7884, "step": 3175 }, { "epoch": 0.01985087033659632, "grad_norm": 0.12671016156673431, "learning_rate": 0.0015, "loss": 2.7763, "step": 3200 }, { "epoch": 0.01985087033659632, "eval_loss": 4.7127766609191895, "perplexity": 111.3609390258789, "step": 3200 }, { "epoch": 0.02000595526110098, "grad_norm": 0.10381816327571869, "learning_rate": 0.0015, "loss": 2.7849, "step": 3225 }, { "epoch": 0.020161040185605637, "grad_norm": 0.12319795787334442, "learning_rate": 0.0015, "loss": 2.8325, "step": 3250 }, { "epoch": 0.020316125110110297, "grad_norm": 0.11378122121095657, "learning_rate": 0.0015, "loss": 2.7609, "step": 3275 }, { "epoch": 0.020471210034614957, "grad_norm": 0.08910433948040009, "learning_rate": 0.0015, "loss": 2.7886, "step": 3300 }, { "epoch": 0.020626294959119613, "grad_norm": 0.11803348362445831, "learning_rate": 0.0015, "loss": 2.7716, "step": 3325 }, { "epoch": 0.020781379883624274, "grad_norm": 0.10203807801008224, "learning_rate": 0.0015, "loss": 2.778, "step": 3350 }, { "epoch": 0.02093646480812893, "grad_norm": 0.07175683230161667, "learning_rate": 0.0015, "loss": 2.7844, "step": 3375 }, { "epoch": 0.02109154973263359, "grad_norm": 0.1556989699602127, "learning_rate": 0.0015, "loss": 2.748, "step": 3400 }, { "epoch": 0.02109154973263359, "eval_loss": 4.711516857147217, "perplexity": 111.22074127197266, "step": 3400 }, { "epoch": 0.02124663465713825, "grad_norm": 0.11983326822519302, "learning_rate": 0.0015, "loss": 2.7747, "step": 3425 }, { "epoch": 0.021401719581642907, "grad_norm": 0.09098344296216965, "learning_rate": 0.0015, "loss": 2.7609, "step": 3450 }, { "epoch": 0.021556804506147567, "grad_norm": 0.1238594651222229, "learning_rate": 0.0015, "loss": 2.7849, "step": 3475 }, { "epoch": 0.021711889430652224, "grad_norm": 0.10654041916131973, "learning_rate": 0.0015, "loss": 2.7742, "step": 3500 }, { "epoch": 0.021866974355156884, "grad_norm": 0.12955708801746368, "learning_rate": 0.0015, "loss": 2.7302, "step": 3525 }, { "epoch": 0.022022059279661544, "grad_norm": 0.0945751890540123, "learning_rate": 0.0015, "loss": 2.7366, "step": 3550 }, { "epoch": 0.0221771442041662, "grad_norm": 0.11322261393070221, "learning_rate": 0.0015, "loss": 2.7307, "step": 3575 }, { "epoch": 0.02233222912867086, "grad_norm": 0.14438313245773315, "learning_rate": 0.0015, "loss": 2.741, "step": 3600 }, { "epoch": 0.02233222912867086, "eval_loss": 4.7056427001953125, "perplexity": 110.56932830810547, "step": 3600 }, { "epoch": 0.022487314053175517, "grad_norm": 0.12101957201957703, "learning_rate": 0.0015, "loss": 2.7699, "step": 3625 }, { "epoch": 0.022642398977680177, "grad_norm": 0.13060438632965088, "learning_rate": 0.0015, "loss": 2.7534, "step": 3650 }, { "epoch": 0.022797483902184838, "grad_norm": 0.18028861284255981, "learning_rate": 0.0015, "loss": 2.7716, "step": 3675 }, { "epoch": 0.022952568826689494, "grad_norm": 0.2551407217979431, "learning_rate": 0.0015, "loss": 2.7505, "step": 3700 }, { "epoch": 0.023107653751194154, "grad_norm": 0.14461354911327362, "learning_rate": 0.0015, "loss": 2.762, "step": 3725 }, { "epoch": 0.02326273867569881, "grad_norm": 0.08960037678480148, "learning_rate": 0.0015, "loss": 2.7752, "step": 3750 }, { "epoch": 0.02341782360020347, "grad_norm": 0.12423495948314667, "learning_rate": 0.0015, "loss": 2.7649, "step": 3775 }, { "epoch": 0.02357290852470813, "grad_norm": 0.11889061331748962, "learning_rate": 0.0015, "loss": 2.7465, "step": 3800 }, { "epoch": 0.02357290852470813, "eval_loss": 4.709405422210693, "perplexity": 110.98615264892578, "step": 3800 }, { "epoch": 0.023727993449212788, "grad_norm": 0.1310662031173706, "learning_rate": 0.0015, "loss": 2.7739, "step": 3825 }, { "epoch": 0.023883078373717448, "grad_norm": 0.10841766744852066, "learning_rate": 0.0015, "loss": 2.7558, "step": 3850 }, { "epoch": 0.024038163298222108, "grad_norm": 0.11951743066310883, "learning_rate": 0.0015, "loss": 2.7574, "step": 3875 }, { "epoch": 0.024193248222726765, "grad_norm": 0.10914873331785202, "learning_rate": 0.0015, "loss": 2.7593, "step": 3900 }, { "epoch": 0.024348333147231425, "grad_norm": 0.12661431729793549, "learning_rate": 0.0015, "loss": 2.7405, "step": 3925 }, { "epoch": 0.02450341807173608, "grad_norm": 0.09351510554552078, "learning_rate": 0.0015, "loss": 2.7614, "step": 3950 }, { "epoch": 0.02465850299624074, "grad_norm": 0.10916408896446228, "learning_rate": 0.0015, "loss": 2.7348, "step": 3975 }, { "epoch": 0.0248135879207454, "grad_norm": 0.1506185084581375, "learning_rate": 0.0015, "loss": 2.7465, "step": 4000 }, { "epoch": 0.0248135879207454, "eval_loss": 4.691644191741943, "perplexity": 109.03230285644531, "step": 4000 }, { "epoch": 0.024968672845250058, "grad_norm": 0.16664201021194458, "learning_rate": 0.0015, "loss": 2.7099, "step": 4025 }, { "epoch": 0.02512375776975472, "grad_norm": 0.08793428540229797, "learning_rate": 0.0015, "loss": 2.7062, "step": 4050 }, { "epoch": 0.025278842694259375, "grad_norm": 0.10746140778064728, "learning_rate": 0.0015, "loss": 2.7013, "step": 4075 }, { "epoch": 0.025433927618764035, "grad_norm": 0.14466698467731476, "learning_rate": 0.0015, "loss": 2.7366, "step": 4100 }, { "epoch": 0.025589012543268695, "grad_norm": 0.12191653996706009, "learning_rate": 0.0015, "loss": 2.7042, "step": 4125 }, { "epoch": 0.025744097467773352, "grad_norm": 0.10167489945888519, "learning_rate": 0.0015, "loss": 2.7215, "step": 4150 }, { "epoch": 0.025899182392278012, "grad_norm": 0.11334148049354553, "learning_rate": 0.0015, "loss": 2.7365, "step": 4175 }, { "epoch": 0.02605426731678267, "grad_norm": 0.09303794056177139, "learning_rate": 0.0015, "loss": 2.7471, "step": 4200 }, { "epoch": 0.02605426731678267, "eval_loss": 4.692121505737305, "perplexity": 109.08435821533203, "step": 4200 }, { "epoch": 0.02620935224128733, "grad_norm": 0.09444712847471237, "learning_rate": 0.0015, "loss": 2.6965, "step": 4225 }, { "epoch": 0.02636443716579199, "grad_norm": 0.09560113400220871, "learning_rate": 0.0015, "loss": 2.7186, "step": 4250 }, { "epoch": 0.026519522090296645, "grad_norm": 0.10814715176820755, "learning_rate": 0.0015, "loss": 2.7, "step": 4275 }, { "epoch": 0.026674607014801305, "grad_norm": 0.12008251994848251, "learning_rate": 0.0015, "loss": 2.6827, "step": 4300 }, { "epoch": 0.026829691939305966, "grad_norm": 0.13892072439193726, "learning_rate": 0.0015, "loss": 2.7481, "step": 4325 }, { "epoch": 0.026984776863810622, "grad_norm": 0.10116352885961533, "learning_rate": 0.0015, "loss": 2.6839, "step": 4350 }, { "epoch": 0.027139861788315282, "grad_norm": 0.2541595697402954, "learning_rate": 0.0015, "loss": 2.6987, "step": 4375 }, { "epoch": 0.02729494671281994, "grad_norm": 0.11070574074983597, "learning_rate": 0.0015, "loss": 2.7102, "step": 4400 }, { "epoch": 0.02729494671281994, "eval_loss": 4.702114105224609, "perplexity": 110.17985534667969, "step": 4400 }, { "epoch": 0.0274500316373246, "grad_norm": 0.09290622174739838, "learning_rate": 0.0015, "loss": 2.744, "step": 4425 }, { "epoch": 0.02760511656182926, "grad_norm": 0.09867129474878311, "learning_rate": 0.0015, "loss": 2.6979, "step": 4450 }, { "epoch": 0.027760201486333916, "grad_norm": 0.08975850045681, "learning_rate": 0.0015, "loss": 2.7346, "step": 4475 }, { "epoch": 0.027915286410838576, "grad_norm": 0.1251811683177948, "learning_rate": 0.0015, "loss": 2.6901, "step": 4500 }, { "epoch": 0.028070371335343233, "grad_norm": 0.10718528181314468, "learning_rate": 0.0015, "loss": 2.6584, "step": 4525 }, { "epoch": 0.028225456259847893, "grad_norm": 0.1920158714056015, "learning_rate": 0.0015, "loss": 2.6776, "step": 4550 }, { "epoch": 0.028380541184352553, "grad_norm": 0.11409153789281845, "learning_rate": 0.0015, "loss": 2.7052, "step": 4575 }, { "epoch": 0.02853562610885721, "grad_norm": 0.12506772577762604, "learning_rate": 0.0015, "loss": 2.6954, "step": 4600 }, { "epoch": 0.02853562610885721, "eval_loss": 4.685390949249268, "perplexity": 108.35262298583984, "step": 4600 }, { "epoch": 0.02869071103336187, "grad_norm": 0.1093166172504425, "learning_rate": 0.0015, "loss": 2.7257, "step": 4625 }, { "epoch": 0.028845795957866526, "grad_norm": 0.16628532111644745, "learning_rate": 0.0015, "loss": 2.6782, "step": 4650 }, { "epoch": 0.029000880882371186, "grad_norm": 0.1638079136610031, "learning_rate": 0.0015, "loss": 2.6884, "step": 4675 }, { "epoch": 0.029155965806875846, "grad_norm": 0.11411619931459427, "learning_rate": 0.0015, "loss": 2.7054, "step": 4700 }, { "epoch": 0.029311050731380503, "grad_norm": 0.09292814135551453, "learning_rate": 0.0015, "loss": 2.6826, "step": 4725 }, { "epoch": 0.029466135655885163, "grad_norm": 0.09136354923248291, "learning_rate": 0.0015, "loss": 2.6936, "step": 4750 }, { "epoch": 0.029621220580389823, "grad_norm": 0.1188502386212349, "learning_rate": 0.0015, "loss": 2.6466, "step": 4775 }, { "epoch": 0.02977630550489448, "grad_norm": 0.09645655751228333, "learning_rate": 0.0015, "loss": 2.6092, "step": 4800 }, { "epoch": 0.02977630550489448, "eval_loss": 4.683995723724365, "perplexity": 108.20155334472656, "step": 4800 }, { "epoch": 0.02993139042939914, "grad_norm": 0.17193672060966492, "learning_rate": 0.0015, "loss": 2.6916, "step": 4825 }, { "epoch": 0.030086475353903797, "grad_norm": 0.14866988360881805, "learning_rate": 0.0015, "loss": 2.6776, "step": 4850 }, { "epoch": 0.030241560278408457, "grad_norm": 0.10588869452476501, "learning_rate": 0.0015, "loss": 2.6773, "step": 4875 }, { "epoch": 0.030396645202913117, "grad_norm": 0.12059559673070908, "learning_rate": 0.0015, "loss": 2.639, "step": 4900 }, { "epoch": 0.030551730127417773, "grad_norm": 0.13296598196029663, "learning_rate": 0.0015, "loss": 2.6359, "step": 4925 }, { "epoch": 0.030706815051922434, "grad_norm": 0.12300167232751846, "learning_rate": 0.0015, "loss": 2.668, "step": 4950 }, { "epoch": 0.03086189997642709, "grad_norm": 0.15900522470474243, "learning_rate": 0.0015, "loss": 2.6252, "step": 4975 }, { "epoch": 0.03101698490093175, "grad_norm": 0.138090580701828, "learning_rate": 0.0015, "loss": 2.659, "step": 5000 }, { "epoch": 0.03101698490093175, "eval_loss": 4.688181400299072, "perplexity": 108.65540313720703, "step": 5000 }, { "epoch": 0.03117206982543641, "grad_norm": 0.13720737397670746, "learning_rate": 0.0015, "loss": 2.6096, "step": 5025 }, { "epoch": 0.03132715474994107, "grad_norm": 0.13671600818634033, "learning_rate": 0.0015, "loss": 2.647, "step": 5050 }, { "epoch": 0.031482239674445724, "grad_norm": 0.12611277401447296, "learning_rate": 0.0015, "loss": 2.639, "step": 5075 }, { "epoch": 0.031637324598950384, "grad_norm": 0.12045291066169739, "learning_rate": 0.0015, "loss": 2.663, "step": 5100 }, { "epoch": 0.031792409523455044, "grad_norm": 0.10857657343149185, "learning_rate": 0.0015, "loss": 2.6677, "step": 5125 }, { "epoch": 0.031947494447959704, "grad_norm": 0.12052007764577866, "learning_rate": 0.0015, "loss": 2.6508, "step": 5150 }, { "epoch": 0.032102579372464364, "grad_norm": 0.10999467223882675, "learning_rate": 0.0015, "loss": 2.661, "step": 5175 }, { "epoch": 0.03225766429696902, "grad_norm": 0.11075185984373093, "learning_rate": 0.0015, "loss": 2.6645, "step": 5200 }, { "epoch": 0.03225766429696902, "eval_loss": 4.706582546234131, "perplexity": 110.67329406738281, "step": 5200 }, { "epoch": 0.03241274922147368, "grad_norm": 0.09703061729669571, "learning_rate": 0.0015, "loss": 2.6109, "step": 5225 }, { "epoch": 0.03256783414597834, "grad_norm": 0.13556119799613953, "learning_rate": 0.0015, "loss": 2.6621, "step": 5250 }, { "epoch": 0.032722919070483, "grad_norm": 0.09178316593170166, "learning_rate": 0.0015, "loss": 2.6263, "step": 5275 }, { "epoch": 0.03287800399498766, "grad_norm": 0.10839138180017471, "learning_rate": 0.0015, "loss": 2.5999, "step": 5300 }, { "epoch": 0.03303308891949231, "grad_norm": 0.12049377709627151, "learning_rate": 0.0015, "loss": 2.6085, "step": 5325 }, { "epoch": 0.03318817384399697, "grad_norm": 0.15260230004787445, "learning_rate": 0.0015, "loss": 2.664, "step": 5350 }, { "epoch": 0.03334325876850163, "grad_norm": 0.12393297255039215, "learning_rate": 0.0015, "loss": 2.6234, "step": 5375 }, { "epoch": 0.03349834369300629, "grad_norm": 0.1284521073102951, "learning_rate": 0.0015, "loss": 2.5624, "step": 5400 }, { "epoch": 0.03349834369300629, "eval_loss": 4.696901321411133, "perplexity": 109.60700988769531, "step": 5400 }, { "epoch": 0.03365342861751095, "grad_norm": 0.18052247166633606, "learning_rate": 0.0015, "loss": 2.5779, "step": 5425 }, { "epoch": 0.033808513542015604, "grad_norm": 0.11775010824203491, "learning_rate": 0.0015, "loss": 2.6167, "step": 5450 }, { "epoch": 0.033963598466520264, "grad_norm": 0.13769109547138214, "learning_rate": 0.0015, "loss": 2.6117, "step": 5475 }, { "epoch": 0.034118683391024925, "grad_norm": 0.09634970873594284, "learning_rate": 0.0015, "loss": 2.613, "step": 5500 }, { "epoch": 0.034273768315529585, "grad_norm": 0.14692488312721252, "learning_rate": 0.0015, "loss": 2.6176, "step": 5525 }, { "epoch": 0.034428853240034245, "grad_norm": 0.21920783817768097, "learning_rate": 0.0015, "loss": 2.6196, "step": 5550 }, { "epoch": 0.034583938164538905, "grad_norm": 0.1033003106713295, "learning_rate": 0.0015, "loss": 2.5872, "step": 5575 }, { "epoch": 0.03473902308904356, "grad_norm": 0.09867612272500992, "learning_rate": 0.0015, "loss": 2.5782, "step": 5600 }, { "epoch": 0.03473902308904356, "eval_loss": 4.704063892364502, "perplexity": 110.3948974609375, "step": 5600 }, { "epoch": 0.03489410801354822, "grad_norm": 0.1032184287905693, "learning_rate": 0.0015, "loss": 2.6187, "step": 5625 }, { "epoch": 0.03504919293805288, "grad_norm": 0.12661318480968475, "learning_rate": 0.0015, "loss": 2.5805, "step": 5650 }, { "epoch": 0.03520427786255754, "grad_norm": 0.28772449493408203, "learning_rate": 0.0015, "loss": 2.7518, "step": 5675 }, { "epoch": 0.0353593627870622, "grad_norm": 0.10005131363868713, "learning_rate": 0.0015, "loss": 2.8556, "step": 5700 }, { "epoch": 0.03551444771156685, "grad_norm": 0.10379570722579956, "learning_rate": 0.0015, "loss": 2.8648, "step": 5725 }, { "epoch": 0.03566953263607151, "grad_norm": 0.08921229094266891, "learning_rate": 0.0015, "loss": 2.8421, "step": 5750 }, { "epoch": 0.03582461756057617, "grad_norm": 0.15366144478321075, "learning_rate": 0.0015, "loss": 2.8162, "step": 5775 }, { "epoch": 0.03597970248508083, "grad_norm": 0.12743431329727173, "learning_rate": 0.0015, "loss": 2.8635, "step": 5800 }, { "epoch": 0.03597970248508083, "eval_loss": 4.674878120422363, "perplexity": 107.21949768066406, "step": 5800 }, { "epoch": 0.03613478740958549, "grad_norm": 0.08773666620254517, "learning_rate": 0.0015, "loss": 2.8787, "step": 5825 }, { "epoch": 0.036289872334090145, "grad_norm": 0.11721781641244888, "learning_rate": 0.0015, "loss": 2.853, "step": 5850 }, { "epoch": 0.036444957258594805, "grad_norm": 0.09957700222730637, "learning_rate": 0.0015, "loss": 2.8163, "step": 5875 }, { "epoch": 0.036600042183099465, "grad_norm": 0.09999966621398926, "learning_rate": 0.0015, "loss": 2.8206, "step": 5900 }, { "epoch": 0.036755127107604126, "grad_norm": 0.09899301081895828, "learning_rate": 0.0015, "loss": 2.8378, "step": 5925 }, { "epoch": 0.036910212032108786, "grad_norm": 0.09676779061555862, "learning_rate": 0.0015, "loss": 2.8385, "step": 5950 }, { "epoch": 0.03706529695661344, "grad_norm": 0.14397811889648438, "learning_rate": 0.0015, "loss": 2.8639, "step": 5975 }, { "epoch": 0.0372203818811181, "grad_norm": 0.08991026133298874, "learning_rate": 0.0015, "loss": 2.862, "step": 6000 }, { "epoch": 0.0372203818811181, "eval_loss": 4.649503707885742, "perplexity": 104.53309631347656, "step": 6000 }, { "epoch": 0.03737546680562276, "grad_norm": 0.11916879564523697, "learning_rate": 0.0015, "loss": 2.8336, "step": 6025 }, { "epoch": 0.03753055173012742, "grad_norm": 0.1533547192811966, "learning_rate": 0.0015, "loss": 2.8154, "step": 6050 }, { "epoch": 0.03768563665463208, "grad_norm": 0.10416785627603531, "learning_rate": 0.0015, "loss": 2.8073, "step": 6075 }, { "epoch": 0.03784072157913673, "grad_norm": 0.1307593733072281, "learning_rate": 0.0015, "loss": 2.8227, "step": 6100 }, { "epoch": 0.03799580650364139, "grad_norm": 0.11226139962673187, "learning_rate": 0.0015, "loss": 2.8316, "step": 6125 }, { "epoch": 0.03815089142814605, "grad_norm": 0.12050950527191162, "learning_rate": 0.0015, "loss": 2.8636, "step": 6150 }, { "epoch": 0.03830597635265071, "grad_norm": 0.14836955070495605, "learning_rate": 0.0015, "loss": 2.8433, "step": 6175 }, { "epoch": 0.03846106127715537, "grad_norm": 0.1240909993648529, "learning_rate": 0.0015, "loss": 2.885, "step": 6200 }, { "epoch": 0.03846106127715537, "eval_loss": 4.652696132659912, "perplexity": 104.86734008789062, "step": 6200 }, { "epoch": 0.038616146201660026, "grad_norm": 0.09549515694379807, "learning_rate": 0.0015, "loss": 2.822, "step": 6225 }, { "epoch": 0.038771231126164686, "grad_norm": 0.1386450082063675, "learning_rate": 0.0015, "loss": 2.8455, "step": 6250 }, { "epoch": 0.038926316050669346, "grad_norm": 0.10233025252819061, "learning_rate": 0.0015, "loss": 2.834, "step": 6275 }, { "epoch": 0.039081400975174006, "grad_norm": 0.09776704013347626, "learning_rate": 0.0015, "loss": 2.8114, "step": 6300 }, { "epoch": 0.039236485899678666, "grad_norm": 0.09631351381540298, "learning_rate": 0.0015, "loss": 2.8107, "step": 6325 }, { "epoch": 0.03939157082418332, "grad_norm": 0.08424117416143417, "learning_rate": 0.0015, "loss": 2.8373, "step": 6350 }, { "epoch": 0.03954665574868798, "grad_norm": 0.14171521365642548, "learning_rate": 0.0015, "loss": 2.8394, "step": 6375 }, { "epoch": 0.03970174067319264, "grad_norm": 0.11349046230316162, "learning_rate": 0.0015, "loss": 2.8131, "step": 6400 }, { "epoch": 0.03970174067319264, "eval_loss": 4.652514934539795, "perplexity": 104.84834289550781, "step": 6400 }, { "epoch": 0.0398568255976973, "grad_norm": 0.09066054224967957, "learning_rate": 0.0015, "loss": 2.8758, "step": 6425 }, { "epoch": 0.04001191052220196, "grad_norm": 0.09391192346811295, "learning_rate": 0.0015, "loss": 2.826, "step": 6450 }, { "epoch": 0.04016699544670661, "grad_norm": 0.17412593960762024, "learning_rate": 0.0015, "loss": 2.8487, "step": 6475 }, { "epoch": 0.04032208037121127, "grad_norm": 0.17672564089298248, "learning_rate": 0.0015, "loss": 2.8441, "step": 6500 }, { "epoch": 0.04047716529571593, "grad_norm": 0.11427825689315796, "learning_rate": 0.0015, "loss": 2.8843, "step": 6525 }, { "epoch": 0.04063225022022059, "grad_norm": 0.13745597004890442, "learning_rate": 0.0015, "loss": 2.8458, "step": 6550 }, { "epoch": 0.040787335144725254, "grad_norm": 0.12339327484369278, "learning_rate": 0.0015, "loss": 2.8299, "step": 6575 }, { "epoch": 0.040942420069229914, "grad_norm": 0.11045660078525543, "learning_rate": 0.0015, "loss": 2.8504, "step": 6600 }, { "epoch": 0.040942420069229914, "eval_loss": 4.645139217376709, "perplexity": 104.0778579711914, "step": 6600 }, { "epoch": 0.04109750499373457, "grad_norm": 0.14822149276733398, "learning_rate": 0.0015, "loss": 2.8438, "step": 6625 }, { "epoch": 0.04125258991823923, "grad_norm": 0.09271769225597382, "learning_rate": 0.0015, "loss": 2.8195, "step": 6650 }, { "epoch": 0.04140767484274389, "grad_norm": 0.12357133626937866, "learning_rate": 0.0015, "loss": 2.8434, "step": 6675 }, { "epoch": 0.04156275976724855, "grad_norm": 0.12669824063777924, "learning_rate": 0.0015, "loss": 2.8262, "step": 6700 }, { "epoch": 0.04171784469175321, "grad_norm": 0.10409893840551376, "learning_rate": 0.0015, "loss": 2.8164, "step": 6725 }, { "epoch": 0.04187292961625786, "grad_norm": 0.10687699913978577, "learning_rate": 0.0015, "loss": 2.83, "step": 6750 }, { "epoch": 0.04202801454076252, "grad_norm": 0.09924216568470001, "learning_rate": 0.0015, "loss": 2.8415, "step": 6775 }, { "epoch": 0.04218309946526718, "grad_norm": 0.11719833314418793, "learning_rate": 0.0015, "loss": 2.8368, "step": 6800 }, { "epoch": 0.04218309946526718, "eval_loss": 4.673882484436035, "perplexity": 107.11280059814453, "step": 6800 }, { "epoch": 0.04233818438977184, "grad_norm": 0.10162920504808426, "learning_rate": 0.0015, "loss": 2.8285, "step": 6825 }, { "epoch": 0.0424932693142765, "grad_norm": 0.10563603043556213, "learning_rate": 0.0015, "loss": 2.809, "step": 6850 }, { "epoch": 0.042648354238781154, "grad_norm": 0.079631008207798, "learning_rate": 0.0015, "loss": 2.8362, "step": 6875 }, { "epoch": 0.042803439163285814, "grad_norm": 0.11915802210569382, "learning_rate": 0.0015, "loss": 2.8211, "step": 6900 }, { "epoch": 0.042958524087790474, "grad_norm": 0.13783864676952362, "learning_rate": 0.0015, "loss": 2.8403, "step": 6925 }, { "epoch": 0.043113609012295134, "grad_norm": 0.17333541810512543, "learning_rate": 0.0015, "loss": 2.8699, "step": 6950 }, { "epoch": 0.043268693936799794, "grad_norm": 0.10923554003238678, "learning_rate": 0.0015, "loss": 2.8016, "step": 6975 }, { "epoch": 0.04342377886130445, "grad_norm": 0.10525023192167282, "learning_rate": 0.0015, "loss": 2.8302, "step": 7000 }, { "epoch": 0.04342377886130445, "eval_loss": 4.660215854644775, "perplexity": 105.65888977050781, "step": 7000 }, { "epoch": 0.04357886378580911, "grad_norm": 0.10499420017004013, "learning_rate": 0.0015, "loss": 2.8215, "step": 7025 }, { "epoch": 0.04373394871031377, "grad_norm": 0.09560755640268326, "learning_rate": 0.0015, "loss": 2.8279, "step": 7050 }, { "epoch": 0.04388903363481843, "grad_norm": 0.10454019159078598, "learning_rate": 0.0015, "loss": 2.8161, "step": 7075 }, { "epoch": 0.04404411855932309, "grad_norm": 0.0982690081000328, "learning_rate": 0.0015, "loss": 2.7895, "step": 7100 }, { "epoch": 0.04419920348382774, "grad_norm": 0.10405784100294113, "learning_rate": 0.0015, "loss": 2.7945, "step": 7125 }, { "epoch": 0.0443542884083324, "grad_norm": 0.09310988336801529, "learning_rate": 0.0015, "loss": 2.8535, "step": 7150 }, { "epoch": 0.04450937333283706, "grad_norm": 0.1031995639204979, "learning_rate": 0.0015, "loss": 2.8298, "step": 7175 }, { "epoch": 0.04466445825734172, "grad_norm": 0.09206147491931915, "learning_rate": 0.0015, "loss": 2.794, "step": 7200 }, { "epoch": 0.04466445825734172, "eval_loss": 4.642621994018555, "perplexity": 103.81619262695312, "step": 7200 }, { "epoch": 0.04481954318184638, "grad_norm": 0.1051359549164772, "learning_rate": 0.0015, "loss": 2.7996, "step": 7225 }, { "epoch": 0.044974628106351035, "grad_norm": 0.12941063940525055, "learning_rate": 0.0015, "loss": 2.792, "step": 7250 }, { "epoch": 0.045129713030855695, "grad_norm": 0.09297281503677368, "learning_rate": 0.0015, "loss": 2.7847, "step": 7275 }, { "epoch": 0.045284797955360355, "grad_norm": 0.11114951968193054, "learning_rate": 0.0015, "loss": 2.8164, "step": 7300 }, { "epoch": 0.045439882879865015, "grad_norm": 0.08519440144300461, "learning_rate": 0.0015, "loss": 2.8053, "step": 7325 }, { "epoch": 0.045594967804369675, "grad_norm": 0.11148552596569061, "learning_rate": 0.0015, "loss": 2.7871, "step": 7350 }, { "epoch": 0.04575005272887433, "grad_norm": 0.136012002825737, "learning_rate": 0.0015, "loss": 2.8457, "step": 7375 }, { "epoch": 0.04590513765337899, "grad_norm": 0.1037759929895401, "learning_rate": 0.0015, "loss": 2.748, "step": 7400 }, { "epoch": 0.04590513765337899, "eval_loss": 4.631537437438965, "perplexity": 102.67179107666016, "step": 7400 }, { "epoch": 0.04606022257788365, "grad_norm": 0.11162275820970535, "learning_rate": 0.0015, "loss": 2.8044, "step": 7425 }, { "epoch": 0.04621530750238831, "grad_norm": 0.11309058219194412, "learning_rate": 0.0015, "loss": 2.8198, "step": 7450 }, { "epoch": 0.04637039242689297, "grad_norm": 0.09359199553728104, "learning_rate": 0.0015, "loss": 2.8302, "step": 7475 }, { "epoch": 0.04652547735139762, "grad_norm": 0.09513767808675766, "learning_rate": 0.0015, "loss": 2.8325, "step": 7500 }, { "epoch": 0.04668056227590228, "grad_norm": 0.08243551850318909, "learning_rate": 0.0015, "loss": 2.7925, "step": 7525 }, { "epoch": 0.04683564720040694, "grad_norm": 0.08001349121332169, "learning_rate": 0.0015, "loss": 2.8406, "step": 7550 }, { "epoch": 0.0469907321249116, "grad_norm": 0.11749595403671265, "learning_rate": 0.0015, "loss": 2.7762, "step": 7575 }, { "epoch": 0.04714581704941626, "grad_norm": 0.15697765350341797, "learning_rate": 0.0015, "loss": 2.8137, "step": 7600 }, { "epoch": 0.04714581704941626, "eval_loss": 4.643322467803955, "perplexity": 103.8889389038086, "step": 7600 }, { "epoch": 0.04730090197392092, "grad_norm": 0.1004658117890358, "learning_rate": 0.0015, "loss": 2.7787, "step": 7625 }, { "epoch": 0.047455986898425576, "grad_norm": 0.11577022075653076, "learning_rate": 0.0015, "loss": 2.806, "step": 7650 }, { "epoch": 0.047611071822930236, "grad_norm": 0.10791046917438507, "learning_rate": 0.0015, "loss": 2.7637, "step": 7675 }, { "epoch": 0.047766156747434896, "grad_norm": 0.09490654617547989, "learning_rate": 0.0015, "loss": 2.8187, "step": 7700 }, { "epoch": 0.047921241671939556, "grad_norm": 0.10448817163705826, "learning_rate": 0.0015, "loss": 2.8335, "step": 7725 }, { "epoch": 0.048076326596444216, "grad_norm": 0.10800398141145706, "learning_rate": 0.0015, "loss": 2.8138, "step": 7750 }, { "epoch": 0.04823141152094887, "grad_norm": 0.10268035531044006, "learning_rate": 0.0015, "loss": 2.8074, "step": 7775 }, { "epoch": 0.04838649644545353, "grad_norm": 0.145925372838974, "learning_rate": 0.0015, "loss": 2.8161, "step": 7800 }, { "epoch": 0.04838649644545353, "eval_loss": 4.628528118133545, "perplexity": 102.36328887939453, "step": 7800 }, { "epoch": 0.04854158136995819, "grad_norm": 0.1422831267118454, "learning_rate": 0.0015, "loss": 2.8179, "step": 7825 }, { "epoch": 0.04869666629446285, "grad_norm": 0.10019826889038086, "learning_rate": 0.0015, "loss": 2.8228, "step": 7850 }, { "epoch": 0.04885175121896751, "grad_norm": 0.12028387933969498, "learning_rate": 0.0015, "loss": 2.8359, "step": 7875 }, { "epoch": 0.04900683614347216, "grad_norm": 0.08171118795871735, "learning_rate": 0.0015, "loss": 2.7829, "step": 7900 }, { "epoch": 0.04916192106797682, "grad_norm": 0.138522207736969, "learning_rate": 0.0015, "loss": 2.7992, "step": 7925 }, { "epoch": 0.04931700599248148, "grad_norm": 0.10419227927923203, "learning_rate": 0.0015, "loss": 2.8097, "step": 7950 }, { "epoch": 0.04947209091698614, "grad_norm": 0.1020691841840744, "learning_rate": 0.0015, "loss": 2.8152, "step": 7975 }, { "epoch": 0.0496271758414908, "grad_norm": 0.12423787266016006, "learning_rate": 0.0015, "loss": 2.7966, "step": 8000 }, { "epoch": 0.0496271758414908, "eval_loss": 4.6273722648620605, "perplexity": 102.24504089355469, "step": 8000 }, { "epoch": 0.049782260765995456, "grad_norm": 0.15230977535247803, "learning_rate": 0.0015, "loss": 2.7575, "step": 8025 }, { "epoch": 0.049937345690500116, "grad_norm": 0.12649676203727722, "learning_rate": 0.0015, "loss": 2.7897, "step": 8050 }, { "epoch": 0.05009243061500478, "grad_norm": 0.11257271468639374, "learning_rate": 0.0015, "loss": 2.8115, "step": 8075 }, { "epoch": 0.05024751553950944, "grad_norm": 0.09349871426820755, "learning_rate": 0.0015, "loss": 2.8041, "step": 8100 }, { "epoch": 0.0504026004640141, "grad_norm": 0.14108401536941528, "learning_rate": 0.0015, "loss": 2.7772, "step": 8125 }, { "epoch": 0.05055768538851875, "grad_norm": 0.17286863923072815, "learning_rate": 0.0015, "loss": 2.8197, "step": 8150 }, { "epoch": 0.05071277031302341, "grad_norm": 0.10759209096431732, "learning_rate": 0.0015, "loss": 2.8396, "step": 8175 }, { "epoch": 0.05086785523752807, "grad_norm": 0.10236554592847824, "learning_rate": 0.0015, "loss": 2.8175, "step": 8200 }, { "epoch": 0.05086785523752807, "eval_loss": 4.610519886016846, "perplexity": 100.5363998413086, "step": 8200 }, { "epoch": 0.05102294016203273, "grad_norm": 0.12348885089159012, "learning_rate": 0.0015, "loss": 2.8139, "step": 8225 }, { "epoch": 0.05117802508653739, "grad_norm": 0.10251584649085999, "learning_rate": 0.0015, "loss": 2.8436, "step": 8250 }, { "epoch": 0.051333110011042044, "grad_norm": 0.10069389641284943, "learning_rate": 0.0015, "loss": 2.8409, "step": 8275 }, { "epoch": 0.051488194935546704, "grad_norm": 0.1546829789876938, "learning_rate": 0.0015, "loss": 2.8199, "step": 8300 }, { "epoch": 0.051643279860051364, "grad_norm": 0.10704527795314789, "learning_rate": 0.0015, "loss": 2.7721, "step": 8325 }, { "epoch": 0.051798364784556024, "grad_norm": 0.12251198291778564, "learning_rate": 0.0015, "loss": 2.8175, "step": 8350 }, { "epoch": 0.051953449709060684, "grad_norm": 0.11113474518060684, "learning_rate": 0.0015, "loss": 2.8085, "step": 8375 }, { "epoch": 0.05210853463356534, "grad_norm": 0.1341187059879303, "learning_rate": 0.0015, "loss": 2.8169, "step": 8400 }, { "epoch": 0.05210853463356534, "eval_loss": 4.610434532165527, "perplexity": 100.52782440185547, "step": 8400 }, { "epoch": 0.05226361955807, "grad_norm": 0.16195224225521088, "learning_rate": 0.0015, "loss": 2.8266, "step": 8425 }, { "epoch": 0.05241870448257466, "grad_norm": 0.1637653261423111, "learning_rate": 0.0015, "loss": 2.8106, "step": 8450 }, { "epoch": 0.05257378940707932, "grad_norm": 0.10014921426773071, "learning_rate": 0.0015, "loss": 2.8103, "step": 8475 }, { "epoch": 0.05272887433158398, "grad_norm": 0.11419603228569031, "learning_rate": 0.0015, "loss": 2.7965, "step": 8500 }, { "epoch": 0.05288395925608863, "grad_norm": 0.08137035369873047, "learning_rate": 0.0015, "loss": 2.7802, "step": 8525 }, { "epoch": 0.05303904418059329, "grad_norm": 0.08078640699386597, "learning_rate": 0.0015, "loss": 2.7819, "step": 8550 }, { "epoch": 0.05319412910509795, "grad_norm": 0.13133442401885986, "learning_rate": 0.0015, "loss": 2.83, "step": 8575 }, { "epoch": 0.05334921402960261, "grad_norm": 0.08819993585348129, "learning_rate": 0.0015, "loss": 2.833, "step": 8600 }, { "epoch": 0.05334921402960261, "eval_loss": 4.603670120239258, "perplexity": 99.85010528564453, "step": 8600 }, { "epoch": 0.05350429895410727, "grad_norm": 0.14662431180477142, "learning_rate": 0.0015, "loss": 2.8201, "step": 8625 }, { "epoch": 0.05365938387861193, "grad_norm": 0.10400764644145966, "learning_rate": 0.0015, "loss": 2.7944, "step": 8650 }, { "epoch": 0.053814468803116584, "grad_norm": 0.2790142297744751, "learning_rate": 0.0015, "loss": 2.8307, "step": 8675 }, { "epoch": 0.053969553727621244, "grad_norm": 0.13645683228969574, "learning_rate": 0.0015, "loss": 2.7904, "step": 8700 }, { "epoch": 0.054124638652125905, "grad_norm": 0.09604925662279129, "learning_rate": 0.0015, "loss": 2.76, "step": 8725 }, { "epoch": 0.054279723576630565, "grad_norm": 0.07631650567054749, "learning_rate": 0.0015, "loss": 2.7955, "step": 8750 }, { "epoch": 0.054434808501135225, "grad_norm": 0.13132531940937042, "learning_rate": 0.0015, "loss": 2.8308, "step": 8775 }, { "epoch": 0.05458989342563988, "grad_norm": 0.08334681391716003, "learning_rate": 0.0015, "loss": 2.755, "step": 8800 }, { "epoch": 0.05458989342563988, "eval_loss": 4.597860336303711, "perplexity": 99.27168273925781, "step": 8800 }, { "epoch": 0.05474497835014454, "grad_norm": 0.10585317760705948, "learning_rate": 0.0015, "loss": 2.7708, "step": 8825 }, { "epoch": 0.0549000632746492, "grad_norm": 0.08953095227479935, "learning_rate": 0.0015, "loss": 2.7622, "step": 8850 }, { "epoch": 0.05505514819915386, "grad_norm": 0.10430523008108139, "learning_rate": 0.0015, "loss": 2.8255, "step": 8875 }, { "epoch": 0.05521023312365852, "grad_norm": 0.08961856365203857, "learning_rate": 0.0015, "loss": 2.7835, "step": 8900 }, { "epoch": 0.05536531804816317, "grad_norm": 0.13602201640605927, "learning_rate": 0.0015, "loss": 2.813, "step": 8925 }, { "epoch": 0.05552040297266783, "grad_norm": 0.1858643889427185, "learning_rate": 0.0015, "loss": 2.8296, "step": 8950 }, { "epoch": 0.05567548789717249, "grad_norm": 0.12873806059360504, "learning_rate": 0.0015, "loss": 2.7669, "step": 8975 }, { "epoch": 0.05583057282167715, "grad_norm": 0.09891733527183533, "learning_rate": 0.0015, "loss": 2.7829, "step": 9000 }, { "epoch": 0.05583057282167715, "eval_loss": 4.606179714202881, "perplexity": 100.10100555419922, "step": 9000 }, { "epoch": 0.05598565774618181, "grad_norm": 0.1619413048028946, "learning_rate": 0.0015, "loss": 2.7885, "step": 9025 }, { "epoch": 0.056140742670686465, "grad_norm": 0.1223379522562027, "learning_rate": 0.0015, "loss": 2.7829, "step": 9050 }, { "epoch": 0.056295827595191125, "grad_norm": 0.10872245579957962, "learning_rate": 0.0015, "loss": 2.7962, "step": 9075 }, { "epoch": 0.056450912519695785, "grad_norm": 0.11461862176656723, "learning_rate": 0.0015, "loss": 2.7476, "step": 9100 }, { "epoch": 0.056605997444200445, "grad_norm": 0.08933119475841522, "learning_rate": 0.0015, "loss": 2.7745, "step": 9125 }, { "epoch": 0.056761082368705106, "grad_norm": 0.12911683320999146, "learning_rate": 0.0015, "loss": 2.8029, "step": 9150 }, { "epoch": 0.05691616729320976, "grad_norm": 0.13963252305984497, "learning_rate": 0.0015, "loss": 2.7931, "step": 9175 }, { "epoch": 0.05707125221771442, "grad_norm": 0.13462606072425842, "learning_rate": 0.0015, "loss": 2.7771, "step": 9200 }, { "epoch": 0.05707125221771442, "eval_loss": 4.619841575622559, "perplexity": 101.47795104980469, "step": 9200 }, { "epoch": 0.05722633714221908, "grad_norm": 0.12551379203796387, "learning_rate": 0.0015, "loss": 2.7934, "step": 9225 }, { "epoch": 0.05738142206672374, "grad_norm": 0.12379872798919678, "learning_rate": 0.0015, "loss": 2.7882, "step": 9250 }, { "epoch": 0.0575365069912284, "grad_norm": 0.0940781831741333, "learning_rate": 0.0015, "loss": 2.7658, "step": 9275 }, { "epoch": 0.05769159191573305, "grad_norm": 0.14165829122066498, "learning_rate": 0.0015, "loss": 2.7973, "step": 9300 }, { "epoch": 0.05784667684023771, "grad_norm": 0.10727201402187347, "learning_rate": 0.0015, "loss": 2.815, "step": 9325 }, { "epoch": 0.05800176176474237, "grad_norm": 0.1628653109073639, "learning_rate": 0.0015, "loss": 2.7854, "step": 9350 }, { "epoch": 0.05815684668924703, "grad_norm": 0.09925588220357895, "learning_rate": 0.0015, "loss": 2.7578, "step": 9375 }, { "epoch": 0.05831193161375169, "grad_norm": 0.1587476134300232, "learning_rate": 0.0015, "loss": 2.7296, "step": 9400 }, { "epoch": 0.05831193161375169, "eval_loss": 4.604221343994141, "perplexity": 99.90515899658203, "step": 9400 }, { "epoch": 0.058467016538256346, "grad_norm": 0.10519708693027496, "learning_rate": 0.0015, "loss": 2.7712, "step": 9425 }, { "epoch": 0.058622101462761006, "grad_norm": 0.10321429371833801, "learning_rate": 0.0015, "loss": 2.7281, "step": 9450 }, { "epoch": 0.058777186387265666, "grad_norm": 0.20060209929943085, "learning_rate": 0.0015, "loss": 2.807, "step": 9475 }, { "epoch": 0.058932271311770326, "grad_norm": 0.10847010463476181, "learning_rate": 0.0015, "loss": 2.8078, "step": 9500 }, { "epoch": 0.059087356236274986, "grad_norm": 0.11248752474784851, "learning_rate": 0.0015, "loss": 2.796, "step": 9525 }, { "epoch": 0.059242441160779646, "grad_norm": 0.13171915709972382, "learning_rate": 0.0015, "loss": 2.7658, "step": 9550 }, { "epoch": 0.0593975260852843, "grad_norm": 0.12041529268026352, "learning_rate": 0.0015, "loss": 2.7507, "step": 9575 }, { "epoch": 0.05955261100978896, "grad_norm": 0.11275593191385269, "learning_rate": 0.0015, "loss": 2.8022, "step": 9600 }, { "epoch": 0.05955261100978896, "eval_loss": 4.5886077880859375, "perplexity": 98.3573989868164, "step": 9600 }, { "epoch": 0.05970769593429362, "grad_norm": 0.1715971678495407, "learning_rate": 0.0015, "loss": 2.8003, "step": 9625 }, { "epoch": 0.05986278085879828, "grad_norm": 0.1223614364862442, "learning_rate": 0.0015, "loss": 2.8012, "step": 9650 }, { "epoch": 0.06001786578330294, "grad_norm": 0.114704430103302, "learning_rate": 0.0015, "loss": 2.7963, "step": 9675 }, { "epoch": 0.06017295070780759, "grad_norm": 0.10282139480113983, "learning_rate": 0.0015, "loss": 2.7965, "step": 9700 }, { "epoch": 0.06032803563231225, "grad_norm": 0.10494767129421234, "learning_rate": 0.0015, "loss": 2.7698, "step": 9725 }, { "epoch": 0.06048312055681691, "grad_norm": 0.0908605083823204, "learning_rate": 0.0015, "loss": 2.749, "step": 9750 }, { "epoch": 0.06063820548132157, "grad_norm": 0.0847998857498169, "learning_rate": 0.0015, "loss": 2.838, "step": 9775 }, { "epoch": 0.060793290405826234, "grad_norm": 0.24615754187107086, "learning_rate": 0.0015, "loss": 2.8117, "step": 9800 }, { "epoch": 0.060793290405826234, "eval_loss": 4.593789100646973, "perplexity": 98.86833953857422, "step": 9800 }, { "epoch": 0.06094837533033089, "grad_norm": 0.0959208682179451, "learning_rate": 0.0015, "loss": 2.7845, "step": 9825 }, { "epoch": 0.06110346025483555, "grad_norm": 0.09963307529687881, "learning_rate": 0.0015, "loss": 2.8296, "step": 9850 }, { "epoch": 0.06125854517934021, "grad_norm": 0.1115136444568634, "learning_rate": 0.0015, "loss": 2.7586, "step": 9875 }, { "epoch": 0.06141363010384487, "grad_norm": 0.13883067667484283, "learning_rate": 0.0015, "loss": 2.7978, "step": 9900 }, { "epoch": 0.06156871502834953, "grad_norm": 0.2048570066690445, "learning_rate": 0.0015, "loss": 2.8397, "step": 9925 }, { "epoch": 0.06172379995285418, "grad_norm": 0.1306881606578827, "learning_rate": 0.0015, "loss": 2.8084, "step": 9950 }, { "epoch": 0.06187888487735884, "grad_norm": 0.18285603821277618, "learning_rate": 0.0015, "loss": 2.7989, "step": 9975 }, { "epoch": 0.0620339698018635, "grad_norm": 0.1109723299741745, "learning_rate": 0.0015, "loss": 2.8064, "step": 10000 }, { "epoch": 0.0620339698018635, "eval_loss": 4.5877556800842285, "perplexity": 98.27362823486328, "step": 10000 }, { "epoch": 0.06218905472636816, "grad_norm": 0.12350066751241684, "learning_rate": 0.0015, "loss": 2.7684, "step": 10025 }, { "epoch": 0.06234413965087282, "grad_norm": 0.11565285176038742, "learning_rate": 0.0015, "loss": 2.7748, "step": 10050 }, { "epoch": 0.062499224575377474, "grad_norm": 0.1117839589715004, "learning_rate": 0.0015, "loss": 2.8044, "step": 10075 }, { "epoch": 0.06265430949988214, "grad_norm": 0.1102209985256195, "learning_rate": 0.0015, "loss": 2.7844, "step": 10100 }, { "epoch": 0.0628093944243868, "grad_norm": 0.10270575433969498, "learning_rate": 0.0015, "loss": 2.7685, "step": 10125 }, { "epoch": 0.06296447934889145, "grad_norm": 0.09842963516712189, "learning_rate": 0.0015, "loss": 2.8048, "step": 10150 }, { "epoch": 0.06311956427339611, "grad_norm": 0.10446088761091232, "learning_rate": 0.0015, "loss": 2.8051, "step": 10175 }, { "epoch": 0.06327464919790077, "grad_norm": 0.14759957790374756, "learning_rate": 0.0015, "loss": 2.8089, "step": 10200 }, { "epoch": 0.06327464919790077, "eval_loss": 4.588883399963379, "perplexity": 98.38451385498047, "step": 10200 }, { "epoch": 0.06342973412240543, "grad_norm": 0.12910906970500946, "learning_rate": 0.0015, "loss": 2.8193, "step": 10225 }, { "epoch": 0.06358481904691009, "grad_norm": 0.13095402717590332, "learning_rate": 0.0015, "loss": 2.7509, "step": 10250 }, { "epoch": 0.06373990397141474, "grad_norm": 0.16069594025611877, "learning_rate": 0.0015, "loss": 2.7911, "step": 10275 }, { "epoch": 0.06389498889591941, "grad_norm": 0.08322907984256744, "learning_rate": 0.0015, "loss": 2.8025, "step": 10300 }, { "epoch": 0.06405007382042406, "grad_norm": 0.2328927367925644, "learning_rate": 0.0015, "loss": 2.7863, "step": 10325 }, { "epoch": 0.06420515874492873, "grad_norm": 0.09172859787940979, "learning_rate": 0.0015, "loss": 2.8101, "step": 10350 }, { "epoch": 0.06436024366943338, "grad_norm": 0.13464473187923431, "learning_rate": 0.0015, "loss": 2.7718, "step": 10375 }, { "epoch": 0.06451532859393803, "grad_norm": 0.1284090131521225, "learning_rate": 0.0015, "loss": 2.7667, "step": 10400 }, { "epoch": 0.06451532859393803, "eval_loss": 4.59510612487793, "perplexity": 98.99864196777344, "step": 10400 }, { "epoch": 0.0646704135184427, "grad_norm": 0.13565704226493835, "learning_rate": 0.0015, "loss": 2.7552, "step": 10425 }, { "epoch": 0.06482549844294735, "grad_norm": 0.1089024469256401, "learning_rate": 0.0015, "loss": 2.7838, "step": 10450 }, { "epoch": 0.06498058336745202, "grad_norm": 0.11035135388374329, "learning_rate": 0.0015, "loss": 2.7986, "step": 10475 }, { "epoch": 0.06513566829195667, "grad_norm": 0.08107917010784149, "learning_rate": 0.0015, "loss": 2.7791, "step": 10500 }, { "epoch": 0.06529075321646133, "grad_norm": 0.10200012475252151, "learning_rate": 0.0015, "loss": 2.7636, "step": 10525 }, { "epoch": 0.065445838140966, "grad_norm": 0.08427785336971283, "learning_rate": 0.0015, "loss": 2.794, "step": 10550 }, { "epoch": 0.06560092306547065, "grad_norm": 0.10828018933534622, "learning_rate": 0.0015, "loss": 2.7778, "step": 10575 }, { "epoch": 0.06575600798997532, "grad_norm": 0.12101134657859802, "learning_rate": 0.0015, "loss": 2.7469, "step": 10600 }, { "epoch": 0.06575600798997532, "eval_loss": 4.597805500030518, "perplexity": 99.2662353515625, "step": 10600 }, { "epoch": 0.06591109291447997, "grad_norm": 0.11220554262399673, "learning_rate": 0.0015, "loss": 2.7294, "step": 10625 }, { "epoch": 0.06606617783898462, "grad_norm": 0.13899332284927368, "learning_rate": 0.0015, "loss": 2.763, "step": 10650 }, { "epoch": 0.06622126276348929, "grad_norm": 0.11773937195539474, "learning_rate": 0.0015, "loss": 2.7866, "step": 10675 }, { "epoch": 0.06637634768799394, "grad_norm": 0.11059702187776566, "learning_rate": 0.0015, "loss": 2.8076, "step": 10700 }, { "epoch": 0.06653143261249861, "grad_norm": 0.1251254379749298, "learning_rate": 0.0015, "loss": 2.7674, "step": 10725 }, { "epoch": 0.06668651753700326, "grad_norm": 0.12195979803800583, "learning_rate": 0.0015, "loss": 2.768, "step": 10750 }, { "epoch": 0.06684160246150792, "grad_norm": 0.1487302929162979, "learning_rate": 0.0015, "loss": 2.762, "step": 10775 }, { "epoch": 0.06699668738601258, "grad_norm": 0.1315547525882721, "learning_rate": 0.0015, "loss": 2.7348, "step": 10800 }, { "epoch": 0.06699668738601258, "eval_loss": 4.566490650177002, "perplexity": 96.20589447021484, "step": 10800 }, { "epoch": 0.06715177231051724, "grad_norm": 0.13864025473594666, "learning_rate": 0.0015, "loss": 2.7517, "step": 10825 }, { "epoch": 0.0673068572350219, "grad_norm": 0.08808566629886627, "learning_rate": 0.0015, "loss": 2.7718, "step": 10850 }, { "epoch": 0.06746194215952656, "grad_norm": 0.115321584045887, "learning_rate": 0.0015, "loss": 2.7007, "step": 10875 }, { "epoch": 0.06761702708403121, "grad_norm": 0.10276370495557785, "learning_rate": 0.0015, "loss": 2.7692, "step": 10900 }, { "epoch": 0.06777211200853588, "grad_norm": 0.09534792602062225, "learning_rate": 0.0015, "loss": 2.8186, "step": 10925 }, { "epoch": 0.06792719693304053, "grad_norm": 0.14239507913589478, "learning_rate": 0.0015, "loss": 2.7801, "step": 10950 }, { "epoch": 0.0680822818575452, "grad_norm": 0.11848737299442291, "learning_rate": 0.0015, "loss": 2.7394, "step": 10975 }, { "epoch": 0.06823736678204985, "grad_norm": 0.09367898106575012, "learning_rate": 0.0015, "loss": 2.8043, "step": 11000 }, { "epoch": 0.06823736678204985, "eval_loss": 4.5800089836120605, "perplexity": 97.51527404785156, "step": 11000 }, { "epoch": 0.0683924517065545, "grad_norm": 0.1494915634393692, "learning_rate": 0.0015, "loss": 2.7841, "step": 11025 }, { "epoch": 0.06854753663105917, "grad_norm": 0.09982737898826599, "learning_rate": 0.0015, "loss": 2.7933, "step": 11050 }, { "epoch": 0.06870262155556382, "grad_norm": 0.12379477173089981, "learning_rate": 0.0015, "loss": 2.7419, "step": 11075 }, { "epoch": 0.06885770648006849, "grad_norm": 0.11405149102210999, "learning_rate": 0.0015, "loss": 2.763, "step": 11100 }, { "epoch": 0.06901279140457314, "grad_norm": 0.09574620425701141, "learning_rate": 0.0015, "loss": 2.7961, "step": 11125 }, { "epoch": 0.06916787632907781, "grad_norm": 0.2947874963283539, "learning_rate": 0.0015, "loss": 2.789, "step": 11150 }, { "epoch": 0.06932296125358246, "grad_norm": 0.09219149500131607, "learning_rate": 0.0015, "loss": 2.7951, "step": 11175 }, { "epoch": 0.06947804617808712, "grad_norm": 0.11840498447418213, "learning_rate": 0.0015, "loss": 2.7717, "step": 11200 }, { "epoch": 0.06947804617808712, "eval_loss": 4.564184188842773, "perplexity": 95.98426055908203, "step": 11200 }, { "epoch": 0.06963313110259178, "grad_norm": 0.09422053396701813, "learning_rate": 0.0015, "loss": 2.7976, "step": 11225 }, { "epoch": 0.06978821602709644, "grad_norm": 0.11220031976699829, "learning_rate": 0.0015, "loss": 2.7634, "step": 11250 }, { "epoch": 0.0699433009516011, "grad_norm": 0.10228817909955978, "learning_rate": 0.0015, "loss": 2.7256, "step": 11275 }, { "epoch": 0.07009838587610576, "grad_norm": 0.0929483100771904, "learning_rate": 0.0015, "loss": 2.8005, "step": 11300 }, { "epoch": 0.07025347080061041, "grad_norm": 0.11491668224334717, "learning_rate": 0.0015, "loss": 2.7504, "step": 11325 }, { "epoch": 0.07040855572511508, "grad_norm": 0.15256111323833466, "learning_rate": 0.0015, "loss": 2.7609, "step": 11350 }, { "epoch": 0.07056364064961973, "grad_norm": 0.11576159298419952, "learning_rate": 0.0015, "loss": 2.7742, "step": 11375 }, { "epoch": 0.0707187255741244, "grad_norm": 0.08809765428304672, "learning_rate": 0.0015, "loss": 2.7891, "step": 11400 }, { "epoch": 0.0707187255741244, "eval_loss": 4.568883895874023, "perplexity": 96.43641662597656, "step": 11400 }, { "epoch": 0.07087381049862905, "grad_norm": 0.08563827723264694, "learning_rate": 0.0015, "loss": 2.8066, "step": 11425 }, { "epoch": 0.0710288954231337, "grad_norm": 0.18896931409835815, "learning_rate": 0.0015, "loss": 2.8055, "step": 11450 }, { "epoch": 0.07118398034763837, "grad_norm": 0.13940319418907166, "learning_rate": 0.0015, "loss": 2.7766, "step": 11475 }, { "epoch": 0.07133906527214302, "grad_norm": 0.09737322479486465, "learning_rate": 0.0015, "loss": 2.7945, "step": 11500 }, { "epoch": 0.07149415019664769, "grad_norm": 0.11357785761356354, "learning_rate": 0.0015, "loss": 2.7799, "step": 11525 }, { "epoch": 0.07164923512115234, "grad_norm": 0.10513681918382645, "learning_rate": 0.0015, "loss": 2.7627, "step": 11550 }, { "epoch": 0.071804320045657, "grad_norm": 0.1434682458639145, "learning_rate": 0.0015, "loss": 2.8055, "step": 11575 }, { "epoch": 0.07195940497016166, "grad_norm": 0.10169105976819992, "learning_rate": 0.0015, "loss": 2.7832, "step": 11600 }, { "epoch": 0.07195940497016166, "eval_loss": 4.560365676879883, "perplexity": 95.61843872070312, "step": 11600 }, { "epoch": 0.07211448989466632, "grad_norm": 0.1385478526353836, "learning_rate": 0.0015, "loss": 2.7548, "step": 11625 }, { "epoch": 0.07226957481917098, "grad_norm": 0.1300746351480484, "learning_rate": 0.0015, "loss": 2.7553, "step": 11650 }, { "epoch": 0.07242465974367564, "grad_norm": 0.11596991866827011, "learning_rate": 0.0015, "loss": 2.8095, "step": 11675 }, { "epoch": 0.07257974466818029, "grad_norm": 0.11611347645521164, "learning_rate": 0.0015, "loss": 2.76, "step": 11700 }, { "epoch": 0.07273482959268496, "grad_norm": 0.11249697953462601, "learning_rate": 0.0015, "loss": 2.7827, "step": 11725 }, { "epoch": 0.07288991451718961, "grad_norm": 0.1243973895907402, "learning_rate": 0.0015, "loss": 2.7754, "step": 11750 }, { "epoch": 0.07304499944169428, "grad_norm": 0.08843350410461426, "learning_rate": 0.0015, "loss": 2.8079, "step": 11775 }, { "epoch": 0.07320008436619893, "grad_norm": 0.09881053864955902, "learning_rate": 0.0015, "loss": 2.7961, "step": 11800 }, { "epoch": 0.07320008436619893, "eval_loss": 4.567913055419922, "perplexity": 96.34283447265625, "step": 11800 }, { "epoch": 0.07335516929070358, "grad_norm": 0.08978071063756943, "learning_rate": 0.0015, "loss": 2.7786, "step": 11825 }, { "epoch": 0.07351025421520825, "grad_norm": 0.1376107782125473, "learning_rate": 0.0015, "loss": 2.7931, "step": 11850 }, { "epoch": 0.0736653391397129, "grad_norm": 0.09934777021408081, "learning_rate": 0.0015, "loss": 2.7787, "step": 11875 }, { "epoch": 0.07382042406421757, "grad_norm": 0.17031100392341614, "learning_rate": 0.0015, "loss": 2.7997, "step": 11900 }, { "epoch": 0.07397550898872222, "grad_norm": 0.13974526524543762, "learning_rate": 0.0015, "loss": 2.7975, "step": 11925 }, { "epoch": 0.07413059391322688, "grad_norm": 0.12611718475818634, "learning_rate": 0.0015, "loss": 2.792, "step": 11950 }, { "epoch": 0.07428567883773154, "grad_norm": 0.15177124738693237, "learning_rate": 0.0015, "loss": 2.7904, "step": 11975 }, { "epoch": 0.0744407637622362, "grad_norm": 0.1411113739013672, "learning_rate": 0.0015, "loss": 2.7677, "step": 12000 }, { "epoch": 0.0744407637622362, "eval_loss": 4.5571770668029785, "perplexity": 95.31403350830078, "step": 12000 }, { "epoch": 0.07459584868674086, "grad_norm": 0.08981940150260925, "learning_rate": 0.0015, "loss": 2.7765, "step": 12025 }, { "epoch": 0.07475093361124552, "grad_norm": 0.09796686470508575, "learning_rate": 0.0015, "loss": 2.7503, "step": 12050 }, { "epoch": 0.07490601853575017, "grad_norm": 0.1125386580824852, "learning_rate": 0.0015, "loss": 2.7263, "step": 12075 }, { "epoch": 0.07506110346025484, "grad_norm": 0.11394508183002472, "learning_rate": 0.0015, "loss": 2.7855, "step": 12100 }, { "epoch": 0.07521618838475949, "grad_norm": 0.11744117736816406, "learning_rate": 0.0015, "loss": 2.7698, "step": 12125 }, { "epoch": 0.07537127330926416, "grad_norm": 0.17264704406261444, "learning_rate": 0.0015, "loss": 2.7592, "step": 12150 }, { "epoch": 0.07552635823376881, "grad_norm": 0.10691671818494797, "learning_rate": 0.0015, "loss": 2.7519, "step": 12175 }, { "epoch": 0.07568144315827346, "grad_norm": 0.1205432191491127, "learning_rate": 0.0015, "loss": 2.7676, "step": 12200 }, { "epoch": 0.07568144315827346, "eval_loss": 4.544521808624268, "perplexity": 94.11540985107422, "step": 12200 }, { "epoch": 0.07583652808277813, "grad_norm": 0.1253867894411087, "learning_rate": 0.0015, "loss": 2.7698, "step": 12225 }, { "epoch": 0.07599161300728279, "grad_norm": 0.1450471729040146, "learning_rate": 0.0015, "loss": 2.77, "step": 12250 }, { "epoch": 0.07614669793178745, "grad_norm": 0.17055222392082214, "learning_rate": 0.0015, "loss": 2.7352, "step": 12275 }, { "epoch": 0.0763017828562921, "grad_norm": 0.10687011480331421, "learning_rate": 0.0015, "loss": 2.7988, "step": 12300 }, { "epoch": 0.07645686778079676, "grad_norm": 0.15520496666431427, "learning_rate": 0.0015, "loss": 2.7828, "step": 12325 }, { "epoch": 0.07661195270530143, "grad_norm": 0.09279755502939224, "learning_rate": 0.0015, "loss": 2.7222, "step": 12350 }, { "epoch": 0.07676703762980608, "grad_norm": 0.18024928867816925, "learning_rate": 0.0015, "loss": 2.7555, "step": 12375 }, { "epoch": 0.07692212255431075, "grad_norm": 0.13292630016803741, "learning_rate": 0.0015, "loss": 2.733, "step": 12400 }, { "epoch": 0.07692212255431075, "eval_loss": 4.538700103759766, "perplexity": 93.569091796875, "step": 12400 }, { "epoch": 0.0770772074788154, "grad_norm": 0.09353446960449219, "learning_rate": 0.0015, "loss": 2.7768, "step": 12425 }, { "epoch": 0.07723229240332005, "grad_norm": 0.0946316123008728, "learning_rate": 0.0015, "loss": 2.7321, "step": 12450 }, { "epoch": 0.07738737732782472, "grad_norm": 0.11109050363302231, "learning_rate": 0.0015, "loss": 2.7607, "step": 12475 }, { "epoch": 0.07754246225232937, "grad_norm": 0.10057735443115234, "learning_rate": 0.0015, "loss": 2.7707, "step": 12500 }, { "epoch": 0.07769754717683404, "grad_norm": 0.1466909795999527, "learning_rate": 0.0015, "loss": 2.7434, "step": 12525 }, { "epoch": 0.07785263210133869, "grad_norm": 0.09831534326076508, "learning_rate": 0.0015, "loss": 2.7858, "step": 12550 }, { "epoch": 0.07800771702584335, "grad_norm": 0.13202817738056183, "learning_rate": 0.0015, "loss": 2.7884, "step": 12575 }, { "epoch": 0.07816280195034801, "grad_norm": 0.10797799378633499, "learning_rate": 0.0015, "loss": 2.7788, "step": 12600 }, { "epoch": 0.07816280195034801, "eval_loss": 4.5452494621276855, "perplexity": 94.18392181396484, "step": 12600 }, { "epoch": 0.07831788687485267, "grad_norm": 0.10239394754171371, "learning_rate": 0.0015, "loss": 2.7803, "step": 12625 }, { "epoch": 0.07847297179935733, "grad_norm": 0.10468672215938568, "learning_rate": 0.0015, "loss": 2.7449, "step": 12650 }, { "epoch": 0.07862805672386199, "grad_norm": 0.13691146671772003, "learning_rate": 0.0015, "loss": 2.7837, "step": 12675 }, { "epoch": 0.07878314164836664, "grad_norm": 0.16976097226142883, "learning_rate": 0.0015, "loss": 2.7557, "step": 12700 }, { "epoch": 0.0789382265728713, "grad_norm": 0.09623986482620239, "learning_rate": 0.0015, "loss": 2.7576, "step": 12725 }, { "epoch": 0.07909331149737596, "grad_norm": 0.11203131079673767, "learning_rate": 0.0015, "loss": 2.7846, "step": 12750 }, { "epoch": 0.07924839642188063, "grad_norm": 0.12257611751556396, "learning_rate": 0.0015, "loss": 2.8015, "step": 12775 }, { "epoch": 0.07940348134638528, "grad_norm": 0.08369628340005875, "learning_rate": 0.0015, "loss": 2.7616, "step": 12800 }, { "epoch": 0.07940348134638528, "eval_loss": 4.548933506011963, "perplexity": 94.53153991699219, "step": 12800 }, { "epoch": 0.07955856627088993, "grad_norm": 0.12149519473314285, "learning_rate": 0.0015, "loss": 2.7651, "step": 12825 }, { "epoch": 0.0797136511953946, "grad_norm": 0.09911686927080154, "learning_rate": 0.0015, "loss": 2.7964, "step": 12850 }, { "epoch": 0.07986873611989925, "grad_norm": 0.09883631020784378, "learning_rate": 0.0015, "loss": 2.7461, "step": 12875 }, { "epoch": 0.08002382104440392, "grad_norm": 0.08828576654195786, "learning_rate": 0.0015, "loss": 2.7735, "step": 12900 }, { "epoch": 0.08017890596890857, "grad_norm": 0.18119321763515472, "learning_rate": 0.0015, "loss": 2.7863, "step": 12925 }, { "epoch": 0.08033399089341323, "grad_norm": 0.09123501181602478, "learning_rate": 0.0015, "loss": 2.7559, "step": 12950 }, { "epoch": 0.0804890758179179, "grad_norm": 0.18334759771823883, "learning_rate": 0.0015, "loss": 2.7357, "step": 12975 }, { "epoch": 0.08064416074242255, "grad_norm": 0.08934136480093002, "learning_rate": 0.0015, "loss": 2.8003, "step": 13000 }, { "epoch": 0.08064416074242255, "eval_loss": 4.537932395935059, "perplexity": 93.49728393554688, "step": 13000 }, { "epoch": 0.08079924566692721, "grad_norm": 0.117793008685112, "learning_rate": 0.0015, "loss": 2.738, "step": 13025 }, { "epoch": 0.08095433059143187, "grad_norm": 0.1012151837348938, "learning_rate": 0.0015, "loss": 2.767, "step": 13050 }, { "epoch": 0.08110941551593653, "grad_norm": 0.1099851131439209, "learning_rate": 0.0015, "loss": 2.7899, "step": 13075 }, { "epoch": 0.08126450044044119, "grad_norm": 0.105575330555439, "learning_rate": 0.0015, "loss": 2.7857, "step": 13100 }, { "epoch": 0.08141958536494584, "grad_norm": 0.11926279962062836, "learning_rate": 0.0015, "loss": 2.7821, "step": 13125 }, { "epoch": 0.08157467028945051, "grad_norm": 0.1669924259185791, "learning_rate": 0.0015, "loss": 2.7673, "step": 13150 }, { "epoch": 0.08172975521395516, "grad_norm": 0.11445988714694977, "learning_rate": 0.0015, "loss": 2.8081, "step": 13175 }, { "epoch": 0.08188484013845983, "grad_norm": 0.09700124710798264, "learning_rate": 0.0015, "loss": 2.7841, "step": 13200 }, { "epoch": 0.08188484013845983, "eval_loss": 4.540359973907471, "perplexity": 93.72453308105469, "step": 13200 }, { "epoch": 0.08203992506296448, "grad_norm": 0.11112058907747269, "learning_rate": 0.0015, "loss": 2.7471, "step": 13225 }, { "epoch": 0.08219500998746913, "grad_norm": 0.17890195548534393, "learning_rate": 0.0015, "loss": 2.7898, "step": 13250 }, { "epoch": 0.0823500949119738, "grad_norm": 0.12197751551866531, "learning_rate": 0.0015, "loss": 2.7328, "step": 13275 }, { "epoch": 0.08250517983647845, "grad_norm": 0.11677111685276031, "learning_rate": 0.0015, "loss": 2.7849, "step": 13300 }, { "epoch": 0.08266026476098312, "grad_norm": 0.15514017641544342, "learning_rate": 0.0015, "loss": 2.7561, "step": 13325 }, { "epoch": 0.08281534968548777, "grad_norm": 0.10389192402362823, "learning_rate": 0.0015, "loss": 2.7611, "step": 13350 }, { "epoch": 0.08297043460999243, "grad_norm": 0.10176412016153336, "learning_rate": 0.0015, "loss": 2.7793, "step": 13375 }, { "epoch": 0.0831255195344971, "grad_norm": 0.1043052077293396, "learning_rate": 0.0015, "loss": 2.7375, "step": 13400 }, { "epoch": 0.0831255195344971, "eval_loss": 4.5388336181640625, "perplexity": 93.58158111572266, "step": 13400 }, { "epoch": 0.08328060445900175, "grad_norm": 0.08918718248605728, "learning_rate": 0.0015, "loss": 2.7465, "step": 13425 }, { "epoch": 0.08343568938350641, "grad_norm": 0.10008233785629272, "learning_rate": 0.0015, "loss": 2.7776, "step": 13450 }, { "epoch": 0.08359077430801107, "grad_norm": 0.10228800773620605, "learning_rate": 0.0015, "loss": 2.756, "step": 13475 }, { "epoch": 0.08374585923251572, "grad_norm": 0.0868915542960167, "learning_rate": 0.0015, "loss": 2.7556, "step": 13500 }, { "epoch": 0.08390094415702039, "grad_norm": 0.11076166480779648, "learning_rate": 0.0015, "loss": 2.6975, "step": 13525 }, { "epoch": 0.08405602908152504, "grad_norm": 0.13617128133773804, "learning_rate": 0.0015, "loss": 2.7643, "step": 13550 }, { "epoch": 0.08421111400602971, "grad_norm": 0.15346932411193848, "learning_rate": 0.0015, "loss": 2.7966, "step": 13575 }, { "epoch": 0.08436619893053436, "grad_norm": 0.17080894112586975, "learning_rate": 0.0015, "loss": 2.7636, "step": 13600 }, { "epoch": 0.08436619893053436, "eval_loss": 4.513378620147705, "perplexity": 91.22953033447266, "step": 13600 }, { "epoch": 0.08452128385503901, "grad_norm": 0.11548548936843872, "learning_rate": 0.0015, "loss": 2.7729, "step": 13625 }, { "epoch": 0.08467636877954368, "grad_norm": 0.14650912582874298, "learning_rate": 0.0015, "loss": 2.7063, "step": 13650 }, { "epoch": 0.08483145370404833, "grad_norm": 0.09750749915838242, "learning_rate": 0.0015, "loss": 2.7648, "step": 13675 }, { "epoch": 0.084986538628553, "grad_norm": 0.18051239848136902, "learning_rate": 0.0015, "loss": 2.754, "step": 13700 }, { "epoch": 0.08514162355305765, "grad_norm": 0.21637938916683197, "learning_rate": 0.0015, "loss": 2.7529, "step": 13725 }, { "epoch": 0.08529670847756231, "grad_norm": 0.10037226974964142, "learning_rate": 0.0015, "loss": 2.7638, "step": 13750 }, { "epoch": 0.08545179340206698, "grad_norm": 0.1033267229795456, "learning_rate": 0.0015, "loss": 2.7713, "step": 13775 }, { "epoch": 0.08560687832657163, "grad_norm": 0.09179462492465973, "learning_rate": 0.0015, "loss": 2.8278, "step": 13800 }, { "epoch": 0.08560687832657163, "eval_loss": 4.508410453796387, "perplexity": 90.77741241455078, "step": 13800 }, { "epoch": 0.0857619632510763, "grad_norm": 0.09874552488327026, "learning_rate": 0.0015, "loss": 2.7544, "step": 13825 }, { "epoch": 0.08591704817558095, "grad_norm": 0.17807777225971222, "learning_rate": 0.0015, "loss": 2.7401, "step": 13850 }, { "epoch": 0.0860721331000856, "grad_norm": 0.14388497173786163, "learning_rate": 0.0015, "loss": 2.7879, "step": 13875 }, { "epoch": 0.08622721802459027, "grad_norm": 0.13081450760364532, "learning_rate": 0.0015, "loss": 2.7162, "step": 13900 }, { "epoch": 0.08638230294909492, "grad_norm": 0.15077342092990875, "learning_rate": 0.0015, "loss": 2.757, "step": 13925 }, { "epoch": 0.08653738787359959, "grad_norm": 0.11368410289287567, "learning_rate": 0.0015, "loss": 2.7546, "step": 13950 }, { "epoch": 0.08669247279810424, "grad_norm": 0.16447153687477112, "learning_rate": 0.0015, "loss": 2.7371, "step": 13975 }, { "epoch": 0.0868475577226089, "grad_norm": 0.20563559234142303, "learning_rate": 0.0015, "loss": 2.7474, "step": 14000 }, { "epoch": 0.0868475577226089, "eval_loss": 4.525671005249023, "perplexity": 92.35787963867188, "step": 14000 }, { "epoch": 0.08700264264711356, "grad_norm": 0.10695035755634308, "learning_rate": 0.0015, "loss": 2.7565, "step": 14025 }, { "epoch": 0.08715772757161822, "grad_norm": 0.12368099391460419, "learning_rate": 0.0015, "loss": 2.784, "step": 14050 }, { "epoch": 0.08731281249612288, "grad_norm": 0.11491699516773224, "learning_rate": 0.0015, "loss": 2.7477, "step": 14075 }, { "epoch": 0.08746789742062754, "grad_norm": 0.10570378601551056, "learning_rate": 0.0015, "loss": 2.7575, "step": 14100 }, { "epoch": 0.08762298234513219, "grad_norm": 0.09137633442878723, "learning_rate": 0.0015, "loss": 2.7517, "step": 14125 }, { "epoch": 0.08777806726963686, "grad_norm": 0.09999803453683853, "learning_rate": 0.0015, "loss": 2.7446, "step": 14150 }, { "epoch": 0.08793315219414151, "grad_norm": 0.15709616243839264, "learning_rate": 0.0015, "loss": 2.7606, "step": 14175 }, { "epoch": 0.08808823711864618, "grad_norm": 0.10327859222888947, "learning_rate": 0.0015, "loss": 2.7441, "step": 14200 }, { "epoch": 0.08808823711864618, "eval_loss": 4.521189212799072, "perplexity": 91.94487762451172, "step": 14200 }, { "epoch": 0.08824332204315083, "grad_norm": 0.1964125633239746, "learning_rate": 0.0015, "loss": 2.7109, "step": 14225 }, { "epoch": 0.08839840696765548, "grad_norm": 0.12792247533798218, "learning_rate": 0.0015, "loss": 2.7401, "step": 14250 }, { "epoch": 0.08855349189216015, "grad_norm": 0.17532923817634583, "learning_rate": 0.0015, "loss": 2.7609, "step": 14275 }, { "epoch": 0.0887085768166648, "grad_norm": 0.096143439412117, "learning_rate": 0.0015, "loss": 2.7749, "step": 14300 }, { "epoch": 0.08886366174116947, "grad_norm": 0.12778601050376892, "learning_rate": 0.0015, "loss": 2.6981, "step": 14325 }, { "epoch": 0.08901874666567412, "grad_norm": 0.1130848377943039, "learning_rate": 0.0015, "loss": 2.7255, "step": 14350 }, { "epoch": 0.08917383159017878, "grad_norm": 0.0818464607000351, "learning_rate": 0.0015, "loss": 2.7223, "step": 14375 }, { "epoch": 0.08932891651468344, "grad_norm": 0.10516222566366196, "learning_rate": 0.0015, "loss": 2.7672, "step": 14400 }, { "epoch": 0.08932891651468344, "eval_loss": 4.524067401885986, "perplexity": 92.20989227294922, "step": 14400 }, { "epoch": 0.0894840014391881, "grad_norm": 0.08912840485572815, "learning_rate": 0.0015, "loss": 2.7349, "step": 14425 }, { "epoch": 0.08963908636369276, "grad_norm": 0.11931388080120087, "learning_rate": 0.0015, "loss": 2.7326, "step": 14450 }, { "epoch": 0.08979417128819742, "grad_norm": 0.12271756678819656, "learning_rate": 0.0015, "loss": 2.7327, "step": 14475 }, { "epoch": 0.08994925621270207, "grad_norm": 0.1567191183567047, "learning_rate": 0.0015, "loss": 2.7573, "step": 14500 }, { "epoch": 0.09010434113720674, "grad_norm": 0.1841791719198227, "learning_rate": 0.0015, "loss": 2.7582, "step": 14525 }, { "epoch": 0.09025942606171139, "grad_norm": 0.12743189930915833, "learning_rate": 0.0015, "loss": 2.8061, "step": 14550 }, { "epoch": 0.09041451098621606, "grad_norm": 0.11932828277349472, "learning_rate": 0.0015, "loss": 2.7447, "step": 14575 }, { "epoch": 0.09056959591072071, "grad_norm": 0.18284690380096436, "learning_rate": 0.0015, "loss": 2.7436, "step": 14600 }, { "epoch": 0.09056959591072071, "eval_loss": 4.515897750854492, "perplexity": 91.45964050292969, "step": 14600 }, { "epoch": 0.09072468083522536, "grad_norm": 0.17987670004367828, "learning_rate": 0.0015, "loss": 2.7831, "step": 14625 }, { "epoch": 0.09087976575973003, "grad_norm": 0.10992395132780075, "learning_rate": 0.0015, "loss": 2.7516, "step": 14650 }, { "epoch": 0.09103485068423468, "grad_norm": 0.09343726187944412, "learning_rate": 0.0015, "loss": 2.7475, "step": 14675 }, { "epoch": 0.09118993560873935, "grad_norm": 0.10370751470327377, "learning_rate": 0.0015, "loss": 2.7518, "step": 14700 }, { "epoch": 0.091345020533244, "grad_norm": 0.11190348863601685, "learning_rate": 0.0015, "loss": 2.7482, "step": 14725 }, { "epoch": 0.09150010545774866, "grad_norm": 0.12450053542852402, "learning_rate": 0.0015, "loss": 2.7726, "step": 14750 }, { "epoch": 0.09165519038225332, "grad_norm": 0.11882703006267548, "learning_rate": 0.0015, "loss": 2.7318, "step": 14775 }, { "epoch": 0.09181027530675798, "grad_norm": 0.1315181404352188, "learning_rate": 0.0015, "loss": 2.757, "step": 14800 }, { "epoch": 0.09181027530675798, "eval_loss": 4.521557807922363, "perplexity": 91.97877502441406, "step": 14800 }, { "epoch": 0.09196536023126264, "grad_norm": 0.18574784696102142, "learning_rate": 0.0015, "loss": 2.7353, "step": 14825 }, { "epoch": 0.0921204451557673, "grad_norm": 0.17665444314479828, "learning_rate": 0.0015, "loss": 2.7687, "step": 14850 }, { "epoch": 0.09227553008027195, "grad_norm": 0.12507860362529755, "learning_rate": 0.0015, "loss": 2.7386, "step": 14875 }, { "epoch": 0.09243061500477662, "grad_norm": 0.10472691059112549, "learning_rate": 0.0015, "loss": 2.7716, "step": 14900 }, { "epoch": 0.09258569992928127, "grad_norm": 0.10282575339078903, "learning_rate": 0.0015, "loss": 2.7312, "step": 14925 }, { "epoch": 0.09274078485378594, "grad_norm": 0.12706094980239868, "learning_rate": 0.0015, "loss": 2.7995, "step": 14950 }, { "epoch": 0.09289586977829059, "grad_norm": 0.15283973515033722, "learning_rate": 0.0015, "loss": 2.7313, "step": 14975 }, { "epoch": 0.09305095470279524, "grad_norm": 0.12476324290037155, "learning_rate": 0.0015, "loss": 2.7727, "step": 15000 }, { "epoch": 0.09305095470279524, "eval_loss": 4.547565937042236, "perplexity": 94.40234375, "step": 15000 }, { "epoch": 0.09320603962729991, "grad_norm": 0.12369734048843384, "learning_rate": 0.0015, "loss": 2.7565, "step": 15025 }, { "epoch": 0.09336112455180456, "grad_norm": 0.1322038471698761, "learning_rate": 0.0015, "loss": 2.7588, "step": 15050 }, { "epoch": 0.09351620947630923, "grad_norm": 0.0926559790968895, "learning_rate": 0.0015, "loss": 2.7393, "step": 15075 }, { "epoch": 0.09367129440081388, "grad_norm": 0.17404210567474365, "learning_rate": 0.0015, "loss": 2.723, "step": 15100 }, { "epoch": 0.09382637932531855, "grad_norm": 0.10326647758483887, "learning_rate": 0.0015, "loss": 2.7853, "step": 15125 }, { "epoch": 0.0939814642498232, "grad_norm": 0.13869203627109528, "learning_rate": 0.0015, "loss": 2.7535, "step": 15150 }, { "epoch": 0.09413654917432786, "grad_norm": 0.14325955510139465, "learning_rate": 0.0015, "loss": 2.7597, "step": 15175 }, { "epoch": 0.09429163409883252, "grad_norm": 0.11783768236637115, "learning_rate": 0.0015, "loss": 2.7524, "step": 15200 }, { "epoch": 0.09429163409883252, "eval_loss": 4.5251593589782715, "perplexity": 92.31063842773438, "step": 15200 }, { "epoch": 0.09444671902333718, "grad_norm": 0.12261676043272018, "learning_rate": 0.0015, "loss": 2.7279, "step": 15225 }, { "epoch": 0.09460180394784184, "grad_norm": 0.09966279566287994, "learning_rate": 0.0015, "loss": 2.8119, "step": 15250 }, { "epoch": 0.0947568888723465, "grad_norm": 0.1052974984049797, "learning_rate": 0.0015, "loss": 2.7392, "step": 15275 }, { "epoch": 0.09491197379685115, "grad_norm": 0.11074663698673248, "learning_rate": 0.0015, "loss": 2.7319, "step": 15300 }, { "epoch": 0.09506705872135582, "grad_norm": 0.09762706607580185, "learning_rate": 0.0015, "loss": 2.7806, "step": 15325 }, { "epoch": 0.09522214364586047, "grad_norm": 0.08552476018667221, "learning_rate": 0.0015, "loss": 2.7351, "step": 15350 }, { "epoch": 0.09537722857036514, "grad_norm": 0.13211695849895477, "learning_rate": 0.0015, "loss": 2.7667, "step": 15375 }, { "epoch": 0.09553231349486979, "grad_norm": 0.12074939906597137, "learning_rate": 0.0015, "loss": 2.7614, "step": 15400 }, { "epoch": 0.09553231349486979, "eval_loss": 4.53213357925415, "perplexity": 92.95668029785156, "step": 15400 }, { "epoch": 0.09568739841937444, "grad_norm": 0.11755666136741638, "learning_rate": 0.0015, "loss": 2.7101, "step": 15425 }, { "epoch": 0.09584248334387911, "grad_norm": 0.10476246476173401, "learning_rate": 0.0015, "loss": 2.7391, "step": 15450 }, { "epoch": 0.09599756826838376, "grad_norm": 0.10921350121498108, "learning_rate": 0.0015, "loss": 2.7423, "step": 15475 }, { "epoch": 0.09615265319288843, "grad_norm": 0.11517275124788284, "learning_rate": 0.0015, "loss": 2.7374, "step": 15500 }, { "epoch": 0.09630773811739309, "grad_norm": 0.10500945895910263, "learning_rate": 0.0015, "loss": 2.73, "step": 15525 }, { "epoch": 0.09646282304189774, "grad_norm": 0.0962584912776947, "learning_rate": 0.0015, "loss": 2.7597, "step": 15550 }, { "epoch": 0.0966179079664024, "grad_norm": 0.1273050308227539, "learning_rate": 0.0015, "loss": 2.7306, "step": 15575 }, { "epoch": 0.09677299289090706, "grad_norm": 0.11249135434627533, "learning_rate": 0.0015, "loss": 2.7859, "step": 15600 }, { "epoch": 0.09677299289090706, "eval_loss": 4.537318706512451, "perplexity": 93.43992614746094, "step": 15600 }, { "epoch": 0.09692807781541173, "grad_norm": 0.19111056625843048, "learning_rate": 0.0015, "loss": 2.7386, "step": 15625 }, { "epoch": 0.09708316273991638, "grad_norm": 0.10486472398042679, "learning_rate": 0.0015, "loss": 2.7462, "step": 15650 }, { "epoch": 0.09723824766442103, "grad_norm": 0.1453208327293396, "learning_rate": 0.0015, "loss": 2.762, "step": 15675 }, { "epoch": 0.0973933325889257, "grad_norm": 0.08459452539682388, "learning_rate": 0.0015, "loss": 2.7353, "step": 15700 }, { "epoch": 0.09754841751343035, "grad_norm": 0.11150529980659485, "learning_rate": 0.0015, "loss": 2.7617, "step": 15725 }, { "epoch": 0.09770350243793502, "grad_norm": 0.11301703006029129, "learning_rate": 0.0015, "loss": 2.7623, "step": 15750 }, { "epoch": 0.09785858736243967, "grad_norm": 0.16564789414405823, "learning_rate": 0.0015, "loss": 2.7315, "step": 15775 }, { "epoch": 0.09801367228694433, "grad_norm": 0.08968822658061981, "learning_rate": 0.0015, "loss": 2.7842, "step": 15800 }, { "epoch": 0.09801367228694433, "eval_loss": 4.528219223022461, "perplexity": 92.5935287475586, "step": 15800 }, { "epoch": 0.09816875721144899, "grad_norm": 0.1233256533741951, "learning_rate": 0.0015, "loss": 2.7584, "step": 15825 }, { "epoch": 0.09832384213595365, "grad_norm": 0.18926863372325897, "learning_rate": 0.0015, "loss": 2.7651, "step": 15850 }, { "epoch": 0.09847892706045831, "grad_norm": 0.0912550836801529, "learning_rate": 0.0015, "loss": 2.7551, "step": 15875 }, { "epoch": 0.09863401198496297, "grad_norm": 0.1443813592195511, "learning_rate": 0.0015, "loss": 2.7378, "step": 15900 }, { "epoch": 0.09878909690946762, "grad_norm": 0.11620072275400162, "learning_rate": 0.0015, "loss": 2.7706, "step": 15925 }, { "epoch": 0.09894418183397229, "grad_norm": 0.10275860130786896, "learning_rate": 0.0015, "loss": 2.7502, "step": 15950 }, { "epoch": 0.09909926675847694, "grad_norm": 0.1417694240808487, "learning_rate": 0.0015, "loss": 2.706, "step": 15975 }, { "epoch": 0.0992543516829816, "grad_norm": 0.1121877133846283, "learning_rate": 0.0015, "loss": 2.7537, "step": 16000 }, { "epoch": 0.0992543516829816, "eval_loss": 4.520648956298828, "perplexity": 91.89521789550781, "step": 16000 }, { "epoch": 0.09940943660748626, "grad_norm": 0.10022582858800888, "learning_rate": 0.0015, "loss": 2.7213, "step": 16025 }, { "epoch": 0.09956452153199091, "grad_norm": 0.09722616523504257, "learning_rate": 0.0015, "loss": 2.7437, "step": 16050 }, { "epoch": 0.09971960645649558, "grad_norm": 0.11053729802370071, "learning_rate": 0.0015, "loss": 2.7495, "step": 16075 }, { "epoch": 0.09987469138100023, "grad_norm": 0.10231011360883713, "learning_rate": 0.0015, "loss": 2.7505, "step": 16100 }, { "epoch": 0.1000297763055049, "grad_norm": 0.135975643992424, "learning_rate": 0.0015, "loss": 2.7487, "step": 16125 }, { "epoch": 0.10018486123000955, "grad_norm": 0.11350739002227783, "learning_rate": 0.0015, "loss": 2.7484, "step": 16150 }, { "epoch": 0.1003399461545142, "grad_norm": 0.10639143735170364, "learning_rate": 0.0015, "loss": 2.7429, "step": 16175 }, { "epoch": 0.10049503107901887, "grad_norm": 0.09016221761703491, "learning_rate": 0.0015, "loss": 2.7891, "step": 16200 }, { "epoch": 0.10049503107901887, "eval_loss": 4.5112504959106445, "perplexity": 91.03558349609375, "step": 16200 }, { "epoch": 0.10065011600352353, "grad_norm": 0.11324500292539597, "learning_rate": 0.0015, "loss": 2.7678, "step": 16225 }, { "epoch": 0.1008052009280282, "grad_norm": 0.13268886506557465, "learning_rate": 0.0015, "loss": 2.723, "step": 16250 }, { "epoch": 0.10096028585253285, "grad_norm": 0.11448831856250763, "learning_rate": 0.0015, "loss": 2.7328, "step": 16275 }, { "epoch": 0.1011153707770375, "grad_norm": 0.10799309611320496, "learning_rate": 0.0015, "loss": 2.7478, "step": 16300 }, { "epoch": 0.10127045570154217, "grad_norm": 0.19559204578399658, "learning_rate": 0.0015, "loss": 2.7606, "step": 16325 }, { "epoch": 0.10142554062604682, "grad_norm": 0.14151975512504578, "learning_rate": 0.0015, "loss": 2.7279, "step": 16350 }, { "epoch": 0.10158062555055149, "grad_norm": 0.10044725239276886, "learning_rate": 0.0015, "loss": 2.7609, "step": 16375 }, { "epoch": 0.10173571047505614, "grad_norm": 0.10686340183019638, "learning_rate": 0.0015, "loss": 2.7295, "step": 16400 }, { "epoch": 0.10173571047505614, "eval_loss": 4.521287441253662, "perplexity": 91.95391082763672, "step": 16400 }, { "epoch": 0.1018907953995608, "grad_norm": 0.1561044305562973, "learning_rate": 0.0015, "loss": 2.7769, "step": 16425 }, { "epoch": 0.10204588032406546, "grad_norm": 0.12182148545980453, "learning_rate": 0.0015, "loss": 2.757, "step": 16450 }, { "epoch": 0.10220096524857011, "grad_norm": 0.20665724575519562, "learning_rate": 0.0015, "loss": 2.7349, "step": 16475 }, { "epoch": 0.10235605017307478, "grad_norm": 0.09160878509283066, "learning_rate": 0.0015, "loss": 2.7393, "step": 16500 }, { "epoch": 0.10251113509757943, "grad_norm": 0.16651533544063568, "learning_rate": 0.0015, "loss": 2.7441, "step": 16525 }, { "epoch": 0.10266622002208409, "grad_norm": 0.09358719736337662, "learning_rate": 0.0015, "loss": 2.7297, "step": 16550 }, { "epoch": 0.10282130494658875, "grad_norm": 0.20277003943920135, "learning_rate": 0.0015, "loss": 2.7506, "step": 16575 }, { "epoch": 0.10297638987109341, "grad_norm": 0.13382607698440552, "learning_rate": 0.0015, "loss": 2.7924, "step": 16600 }, { "epoch": 0.10297638987109341, "eval_loss": 4.525242328643799, "perplexity": 92.31829833984375, "step": 16600 }, { "epoch": 0.10313147479559807, "grad_norm": 0.09686290472745895, "learning_rate": 0.0015, "loss": 2.7417, "step": 16625 }, { "epoch": 0.10328655972010273, "grad_norm": 0.11446567624807358, "learning_rate": 0.0015, "loss": 2.7582, "step": 16650 }, { "epoch": 0.10344164464460738, "grad_norm": 0.15948985517024994, "learning_rate": 0.0015, "loss": 2.7254, "step": 16675 }, { "epoch": 0.10359672956911205, "grad_norm": 0.1254827231168747, "learning_rate": 0.0015, "loss": 2.7515, "step": 16700 }, { "epoch": 0.1037518144936167, "grad_norm": 0.11295375972986221, "learning_rate": 0.0015, "loss": 2.7058, "step": 16725 }, { "epoch": 0.10390689941812137, "grad_norm": 0.10659389197826385, "learning_rate": 0.0015, "loss": 2.7281, "step": 16750 }, { "epoch": 0.10406198434262602, "grad_norm": 0.1045156791806221, "learning_rate": 0.0015, "loss": 2.7131, "step": 16775 }, { "epoch": 0.10421706926713067, "grad_norm": 0.13835974037647247, "learning_rate": 0.0015, "loss": 2.744, "step": 16800 }, { "epoch": 0.10421706926713067, "eval_loss": 4.507747650146484, "perplexity": 90.7172622680664, "step": 16800 }, { "epoch": 0.10437215419163534, "grad_norm": 0.19872727990150452, "learning_rate": 0.0015, "loss": 2.7642, "step": 16825 }, { "epoch": 0.10452723911614, "grad_norm": 0.13754956424236298, "learning_rate": 0.0015, "loss": 2.7652, "step": 16850 }, { "epoch": 0.10468232404064466, "grad_norm": 0.1451335996389389, "learning_rate": 0.0015, "loss": 2.7561, "step": 16875 }, { "epoch": 0.10483740896514931, "grad_norm": 0.16750144958496094, "learning_rate": 0.0015, "loss": 2.7206, "step": 16900 }, { "epoch": 0.10499249388965397, "grad_norm": 0.12020619958639145, "learning_rate": 0.0015, "loss": 2.699, "step": 16925 }, { "epoch": 0.10514757881415863, "grad_norm": 0.16792155802249908, "learning_rate": 0.0015, "loss": 2.8062, "step": 16950 }, { "epoch": 0.10530266373866329, "grad_norm": 0.11066465824842453, "learning_rate": 0.0015, "loss": 2.6968, "step": 16975 }, { "epoch": 0.10545774866316796, "grad_norm": 0.11885298788547516, "learning_rate": 0.0015, "loss": 2.7699, "step": 17000 }, { "epoch": 0.10545774866316796, "eval_loss": 4.524214744567871, "perplexity": 92.22348022460938, "step": 17000 }, { "epoch": 0.10561283358767261, "grad_norm": 0.1298653483390808, "learning_rate": 0.0015, "loss": 2.7199, "step": 17025 }, { "epoch": 0.10576791851217726, "grad_norm": 0.11387672275304794, "learning_rate": 0.0015, "loss": 2.7528, "step": 17050 }, { "epoch": 0.10592300343668193, "grad_norm": 0.09852533042430878, "learning_rate": 0.0015, "loss": 2.7277, "step": 17075 }, { "epoch": 0.10607808836118658, "grad_norm": 0.11046476662158966, "learning_rate": 0.0015, "loss": 2.722, "step": 17100 }, { "epoch": 0.10623317328569125, "grad_norm": 0.11632421612739563, "learning_rate": 0.0015, "loss": 2.726, "step": 17125 }, { "epoch": 0.1063882582101959, "grad_norm": 0.11760540306568146, "learning_rate": 0.0015, "loss": 2.7267, "step": 17150 }, { "epoch": 0.10654334313470057, "grad_norm": 0.12264183163642883, "learning_rate": 0.0015, "loss": 2.8037, "step": 17175 }, { "epoch": 0.10669842805920522, "grad_norm": 0.15346336364746094, "learning_rate": 0.0015, "loss": 2.7668, "step": 17200 }, { "epoch": 0.10669842805920522, "eval_loss": 4.503612995147705, "perplexity": 90.34294891357422, "step": 17200 }, { "epoch": 0.10685351298370988, "grad_norm": 0.10642746090888977, "learning_rate": 0.0015, "loss": 2.7295, "step": 17225 }, { "epoch": 0.10700859790821454, "grad_norm": 0.10965430736541748, "learning_rate": 0.0015, "loss": 2.7113, "step": 17250 }, { "epoch": 0.1071636828327192, "grad_norm": 0.09912869334220886, "learning_rate": 0.0015, "loss": 2.7353, "step": 17275 }, { "epoch": 0.10731876775722386, "grad_norm": 0.14111942052841187, "learning_rate": 0.0015, "loss": 2.7064, "step": 17300 }, { "epoch": 0.10747385268172852, "grad_norm": 0.11583065241575241, "learning_rate": 0.0015, "loss": 2.722, "step": 17325 }, { "epoch": 0.10762893760623317, "grad_norm": 0.09374859184026718, "learning_rate": 0.0015, "loss": 2.6964, "step": 17350 }, { "epoch": 0.10778402253073784, "grad_norm": 0.11704573035240173, "learning_rate": 0.0015, "loss": 2.7518, "step": 17375 }, { "epoch": 0.10793910745524249, "grad_norm": 0.13960668444633484, "learning_rate": 0.0015, "loss": 2.7373, "step": 17400 }, { "epoch": 0.10793910745524249, "eval_loss": 4.514464378356934, "perplexity": 91.3286361694336, "step": 17400 }, { "epoch": 0.10809419237974716, "grad_norm": 0.1006089448928833, "learning_rate": 0.0015, "loss": 2.7199, "step": 17425 }, { "epoch": 0.10824927730425181, "grad_norm": 0.14851173758506775, "learning_rate": 0.0015, "loss": 2.7202, "step": 17450 }, { "epoch": 0.10840436222875646, "grad_norm": 0.11992091685533524, "learning_rate": 0.0015, "loss": 2.6932, "step": 17475 }, { "epoch": 0.10855944715326113, "grad_norm": 0.12420158833265305, "learning_rate": 0.0015, "loss": 2.7395, "step": 17500 }, { "epoch": 0.10871453207776578, "grad_norm": 0.09945713728666306, "learning_rate": 0.0015, "loss": 2.7323, "step": 17525 }, { "epoch": 0.10886961700227045, "grad_norm": 0.13007710874080658, "learning_rate": 0.0015, "loss": 2.7438, "step": 17550 }, { "epoch": 0.1090247019267751, "grad_norm": 0.10875315964221954, "learning_rate": 0.0015, "loss": 2.7656, "step": 17575 }, { "epoch": 0.10917978685127976, "grad_norm": 0.1075393334031105, "learning_rate": 0.0015, "loss": 2.7174, "step": 17600 }, { "epoch": 0.10917978685127976, "eval_loss": 4.4858293533325195, "perplexity": 88.75052642822266, "step": 17600 }, { "epoch": 0.10933487177578442, "grad_norm": 0.16400013864040375, "learning_rate": 0.0015, "loss": 2.7389, "step": 17625 }, { "epoch": 0.10948995670028908, "grad_norm": 0.1368722766637802, "learning_rate": 0.0015, "loss": 2.7198, "step": 17650 }, { "epoch": 0.10964504162479374, "grad_norm": 0.23104597628116608, "learning_rate": 0.0015, "loss": 2.7346, "step": 17675 }, { "epoch": 0.1098001265492984, "grad_norm": 0.12463794648647308, "learning_rate": 0.0015, "loss": 2.691, "step": 17700 }, { "epoch": 0.10995521147380305, "grad_norm": 0.19538962841033936, "learning_rate": 0.0015, "loss": 2.6917, "step": 17725 }, { "epoch": 0.11011029639830772, "grad_norm": 0.12000603973865509, "learning_rate": 0.0015, "loss": 2.7431, "step": 17750 }, { "epoch": 0.11026538132281237, "grad_norm": 0.15090298652648926, "learning_rate": 0.0015, "loss": 2.7493, "step": 17775 }, { "epoch": 0.11042046624731704, "grad_norm": 0.13190440833568573, "learning_rate": 0.0015, "loss": 2.7582, "step": 17800 }, { "epoch": 0.11042046624731704, "eval_loss": 4.493134021759033, "perplexity": 89.40119171142578, "step": 17800 }, { "epoch": 0.11057555117182169, "grad_norm": 0.12455850094556808, "learning_rate": 0.0015, "loss": 2.7574, "step": 17825 }, { "epoch": 0.11073063609632634, "grad_norm": 0.14911110699176788, "learning_rate": 0.0015, "loss": 2.7285, "step": 17850 }, { "epoch": 0.11088572102083101, "grad_norm": 0.16008728742599487, "learning_rate": 0.0015, "loss": 2.733, "step": 17875 }, { "epoch": 0.11104080594533566, "grad_norm": 0.1668420433998108, "learning_rate": 0.0015, "loss": 2.7259, "step": 17900 }, { "epoch": 0.11119589086984033, "grad_norm": 0.11736566573381424, "learning_rate": 0.0015, "loss": 2.7682, "step": 17925 }, { "epoch": 0.11135097579434498, "grad_norm": 0.11538700759410858, "learning_rate": 0.0015, "loss": 2.7656, "step": 17950 }, { "epoch": 0.11150606071884964, "grad_norm": 0.09440570324659348, "learning_rate": 0.0015, "loss": 2.7517, "step": 17975 }, { "epoch": 0.1116611456433543, "grad_norm": 0.20621652901172638, "learning_rate": 0.0015, "loss": 2.7292, "step": 18000 }, { "epoch": 0.1116611456433543, "eval_loss": 4.493429183959961, "perplexity": 89.42758178710938, "step": 18000 }, { "epoch": 0.11181623056785896, "grad_norm": 0.12027841061353683, "learning_rate": 0.0015, "loss": 2.7049, "step": 18025 }, { "epoch": 0.11197131549236362, "grad_norm": 0.08760379254817963, "learning_rate": 0.0015, "loss": 2.7291, "step": 18050 }, { "epoch": 0.11212640041686828, "grad_norm": 0.1251729428768158, "learning_rate": 0.0015, "loss": 2.7149, "step": 18075 }, { "epoch": 0.11228148534137293, "grad_norm": 0.10340214520692825, "learning_rate": 0.0015, "loss": 2.7437, "step": 18100 }, { "epoch": 0.1124365702658776, "grad_norm": 0.10546920448541641, "learning_rate": 0.0015, "loss": 2.7656, "step": 18125 }, { "epoch": 0.11259165519038225, "grad_norm": 0.12438227981328964, "learning_rate": 0.0015, "loss": 2.7171, "step": 18150 }, { "epoch": 0.11274674011488692, "grad_norm": 0.14557534456253052, "learning_rate": 0.0015, "loss": 2.7395, "step": 18175 }, { "epoch": 0.11290182503939157, "grad_norm": 0.13714823126792908, "learning_rate": 0.0015, "loss": 2.7066, "step": 18200 }, { "epoch": 0.11290182503939157, "eval_loss": 4.4876604080200195, "perplexity": 88.9131851196289, "step": 18200 }, { "epoch": 0.11305690996389622, "grad_norm": 0.12662547826766968, "learning_rate": 0.0015, "loss": 2.6665, "step": 18225 }, { "epoch": 0.11321199488840089, "grad_norm": 0.10047092288732529, "learning_rate": 0.0015, "loss": 2.7332, "step": 18250 }, { "epoch": 0.11336707981290554, "grad_norm": 0.11126455664634705, "learning_rate": 0.0015, "loss": 2.7154, "step": 18275 }, { "epoch": 0.11352216473741021, "grad_norm": 0.10023871064186096, "learning_rate": 0.0015, "loss": 2.7007, "step": 18300 }, { "epoch": 0.11367724966191486, "grad_norm": 0.11821885406970978, "learning_rate": 0.0015, "loss": 2.7081, "step": 18325 }, { "epoch": 0.11383233458641952, "grad_norm": 0.1216677874326706, "learning_rate": 0.0015, "loss": 2.74, "step": 18350 }, { "epoch": 0.11398741951092418, "grad_norm": 0.1125161275267601, "learning_rate": 0.0015, "loss": 2.733, "step": 18375 }, { "epoch": 0.11414250443542884, "grad_norm": 0.18253153562545776, "learning_rate": 0.0015, "loss": 2.7085, "step": 18400 }, { "epoch": 0.11414250443542884, "eval_loss": 4.501376628875732, "perplexity": 90.1411361694336, "step": 18400 }, { "epoch": 0.1142975893599335, "grad_norm": 0.13288918137550354, "learning_rate": 0.0015, "loss": 2.7033, "step": 18425 }, { "epoch": 0.11445267428443816, "grad_norm": 0.1069432720541954, "learning_rate": 0.0015, "loss": 2.7063, "step": 18450 }, { "epoch": 0.11460775920894281, "grad_norm": 0.1035354733467102, "learning_rate": 0.0015, "loss": 2.7174, "step": 18475 }, { "epoch": 0.11476284413344748, "grad_norm": 0.1121230348944664, "learning_rate": 0.0015, "loss": 2.7, "step": 18500 }, { "epoch": 0.11491792905795213, "grad_norm": 0.13324719667434692, "learning_rate": 0.0015, "loss": 2.7423, "step": 18525 }, { "epoch": 0.1150730139824568, "grad_norm": 0.0891190841794014, "learning_rate": 0.0015, "loss": 2.7418, "step": 18550 }, { "epoch": 0.11522809890696145, "grad_norm": 0.10579492896795273, "learning_rate": 0.0015, "loss": 2.7321, "step": 18575 }, { "epoch": 0.1153831838314661, "grad_norm": 0.1010003387928009, "learning_rate": 0.0015, "loss": 2.7071, "step": 18600 }, { "epoch": 0.1153831838314661, "eval_loss": 4.508904933929443, "perplexity": 90.82231140136719, "step": 18600 }, { "epoch": 0.11553826875597077, "grad_norm": 0.1599242389202118, "learning_rate": 0.0015, "loss": 2.7222, "step": 18625 }, { "epoch": 0.11569335368047542, "grad_norm": 0.09344537556171417, "learning_rate": 0.0015, "loss": 2.7424, "step": 18650 }, { "epoch": 0.11584843860498009, "grad_norm": 0.13959461450576782, "learning_rate": 0.0015, "loss": 2.7584, "step": 18675 }, { "epoch": 0.11600352352948474, "grad_norm": 0.11661764234304428, "learning_rate": 0.0015, "loss": 2.7363, "step": 18700 }, { "epoch": 0.1161586084539894, "grad_norm": 0.11968798190355301, "learning_rate": 0.0015, "loss": 2.7314, "step": 18725 }, { "epoch": 0.11631369337849407, "grad_norm": 0.22232107818126678, "learning_rate": 0.0015, "loss": 2.6992, "step": 18750 }, { "epoch": 0.11646877830299872, "grad_norm": 0.1387198567390442, "learning_rate": 0.0015, "loss": 2.7001, "step": 18775 }, { "epoch": 0.11662386322750339, "grad_norm": 0.17059509456157684, "learning_rate": 0.0015, "loss": 2.7002, "step": 18800 }, { "epoch": 0.11662386322750339, "eval_loss": 4.516000270843506, "perplexity": 91.4690170288086, "step": 18800 }, { "epoch": 0.11677894815200804, "grad_norm": 0.10877668112516403, "learning_rate": 0.0015, "loss": 2.7171, "step": 18825 }, { "epoch": 0.11693403307651269, "grad_norm": 0.11746638268232346, "learning_rate": 0.0015, "loss": 2.7006, "step": 18850 }, { "epoch": 0.11708911800101736, "grad_norm": 0.17617632448673248, "learning_rate": 0.0015, "loss": 2.7427, "step": 18875 }, { "epoch": 0.11724420292552201, "grad_norm": 0.09788820147514343, "learning_rate": 0.0015, "loss": 2.7507, "step": 18900 }, { "epoch": 0.11739928785002668, "grad_norm": 0.1285056471824646, "learning_rate": 0.0015, "loss": 2.7386, "step": 18925 }, { "epoch": 0.11755437277453133, "grad_norm": 0.11705992370843887, "learning_rate": 0.0015, "loss": 2.7234, "step": 18950 }, { "epoch": 0.11770945769903599, "grad_norm": 0.09166467934846878, "learning_rate": 0.0015, "loss": 2.7825, "step": 18975 }, { "epoch": 0.11786454262354065, "grad_norm": 0.11318054795265198, "learning_rate": 0.0015, "loss": 2.778, "step": 19000 }, { "epoch": 0.11786454262354065, "eval_loss": 4.499363422393799, "perplexity": 89.95984649658203, "step": 19000 } ], "logging_steps": 25, "max_steps": 161202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 60, "trial_name": null, "trial_params": null }