{ "best_metric": 5.638747215270996, "best_model_checkpoint": "./results/models/mistral-prot/checkpoint-351531", "epoch": 9.0, "eval_steps": 500, "global_step": 351531, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012801146982769657, "grad_norm": 0.2890625, "learning_rate": 0.001999487954120689, "loss": 5.8774, "step": 500 }, { "epoch": 0.025602293965539313, "grad_norm": 0.97265625, "learning_rate": 0.0019989759082413784, "loss": 5.8257, "step": 1000 }, { "epoch": 0.03840344094830897, "grad_norm": 2.765625, "learning_rate": 0.001998463862362068, "loss": 5.8373, "step": 1500 }, { "epoch": 0.05120458793107863, "grad_norm": 0.703125, "learning_rate": 0.001997951816482757, "loss": 5.8402, "step": 2000 }, { "epoch": 0.06400573491384828, "grad_norm": 0.478515625, "learning_rate": 0.0019974397706034462, "loss": 5.8272, "step": 2500 }, { "epoch": 0.07680688189661794, "grad_norm": 0.98828125, "learning_rate": 0.001996927724724135, "loss": 5.8143, "step": 3000 }, { "epoch": 0.0896080288793876, "grad_norm": 5.6875, "learning_rate": 0.0019964156788448246, "loss": 5.8118, "step": 3500 }, { "epoch": 0.10240917586215725, "grad_norm": 0.75, "learning_rate": 0.0019959036329655136, "loss": 5.8184, "step": 4000 }, { "epoch": 0.11521032284492691, "grad_norm": 3.609375, "learning_rate": 0.001995391587086203, "loss": 5.8083, "step": 4500 }, { "epoch": 0.12801146982769657, "grad_norm": 0.45703125, "learning_rate": 0.001994879541206892, "loss": 5.8025, "step": 5000 }, { "epoch": 0.14081261681046622, "grad_norm": 0.6171875, "learning_rate": 0.0019943674953275814, "loss": 5.8007, "step": 5500 }, { "epoch": 0.15361376379323588, "grad_norm": 3.0625, "learning_rate": 0.0019938554494482704, "loss": 5.7932, "step": 6000 }, { "epoch": 0.16641491077600554, "grad_norm": 0.59765625, "learning_rate": 0.00199334340356896, "loss": 5.7984, "step": 6500 }, { "epoch": 0.1792160577587752, "grad_norm": 0.4609375, "learning_rate": 0.001992831357689649, "loss": 5.7895, "step": 7000 }, { "epoch": 0.19201720474154485, "grad_norm": 0.5, "learning_rate": 0.001992319311810338, "loss": 5.7869, "step": 7500 }, { "epoch": 0.2048183517243145, "grad_norm": 0.8984375, "learning_rate": 0.0019918072659310276, "loss": 5.7823, "step": 8000 }, { "epoch": 0.21761949870708416, "grad_norm": 3.234375, "learning_rate": 0.0019912952200517166, "loss": 5.7778, "step": 8500 }, { "epoch": 0.23042064568985382, "grad_norm": 0.84765625, "learning_rate": 0.001990783174172406, "loss": 5.7752, "step": 9000 }, { "epoch": 0.24322179267262348, "grad_norm": 0.875, "learning_rate": 0.0019902711282930954, "loss": 5.7739, "step": 9500 }, { "epoch": 0.25602293965539313, "grad_norm": 0.8046875, "learning_rate": 0.0019897590824137844, "loss": 5.7784, "step": 10000 }, { "epoch": 0.26882408663816276, "grad_norm": 1.5, "learning_rate": 0.0019892470365344733, "loss": 5.7735, "step": 10500 }, { "epoch": 0.28162523362093245, "grad_norm": 1.0703125, "learning_rate": 0.0019887349906551628, "loss": 5.7704, "step": 11000 }, { "epoch": 0.2944263806037021, "grad_norm": 0.75390625, "learning_rate": 0.001988222944775852, "loss": 5.7677, "step": 11500 }, { "epoch": 0.30722752758647176, "grad_norm": 2.03125, "learning_rate": 0.001987710898896541, "loss": 5.7653, "step": 12000 }, { "epoch": 0.3200286745692414, "grad_norm": 0.69140625, "learning_rate": 0.0019871988530172306, "loss": 5.7659, "step": 12500 }, { "epoch": 0.3328298215520111, "grad_norm": 3.375, "learning_rate": 0.0019866868071379195, "loss": 5.7669, "step": 13000 }, { "epoch": 0.3456309685347807, "grad_norm": 1.1328125, "learning_rate": 0.001986174761258609, "loss": 5.7625, "step": 13500 }, { "epoch": 0.3584321155175504, "grad_norm": 0.953125, "learning_rate": 0.001985662715379298, "loss": 5.762, "step": 14000 }, { "epoch": 0.37123326250032, "grad_norm": 0.8515625, "learning_rate": 0.0019851506694999873, "loss": 5.7618, "step": 14500 }, { "epoch": 0.3840344094830897, "grad_norm": 0.76171875, "learning_rate": 0.0019846386236206763, "loss": 5.7574, "step": 15000 }, { "epoch": 0.39683555646585933, "grad_norm": 0.68359375, "learning_rate": 0.0019841265777413657, "loss": 5.756, "step": 15500 }, { "epoch": 0.409636703448629, "grad_norm": 3.296875, "learning_rate": 0.0019836145318620547, "loss": 5.7518, "step": 16000 }, { "epoch": 0.42243785043139864, "grad_norm": 2.265625, "learning_rate": 0.001983102485982744, "loss": 5.7524, "step": 16500 }, { "epoch": 0.4352389974141683, "grad_norm": 1.546875, "learning_rate": 0.0019825904401034335, "loss": 5.7561, "step": 17000 }, { "epoch": 0.44804014439693796, "grad_norm": 0.8515625, "learning_rate": 0.0019820783942241225, "loss": 5.749, "step": 17500 }, { "epoch": 0.46084129137970764, "grad_norm": 0.8984375, "learning_rate": 0.0019815663483448115, "loss": 5.7498, "step": 18000 }, { "epoch": 0.47364243836247727, "grad_norm": 0.69921875, "learning_rate": 0.001981054302465501, "loss": 5.7474, "step": 18500 }, { "epoch": 0.48644358534524695, "grad_norm": 2.34375, "learning_rate": 0.0019805422565861903, "loss": 5.7462, "step": 19000 }, { "epoch": 0.4992447323280166, "grad_norm": 0.58203125, "learning_rate": 0.0019800302107068793, "loss": 5.7469, "step": 19500 }, { "epoch": 0.5120458793107863, "grad_norm": 3.984375, "learning_rate": 0.0019795181648275687, "loss": 5.7461, "step": 20000 }, { "epoch": 0.524847026293556, "grad_norm": 0.7578125, "learning_rate": 0.0019790061189482577, "loss": 5.7471, "step": 20500 }, { "epoch": 0.5376481732763255, "grad_norm": 4.90625, "learning_rate": 0.001978494073068947, "loss": 5.7454, "step": 21000 }, { "epoch": 0.5504493202590952, "grad_norm": 2.078125, "learning_rate": 0.0019779820271896365, "loss": 5.744, "step": 21500 }, { "epoch": 0.5632504672418649, "grad_norm": 1.078125, "learning_rate": 0.0019774699813103255, "loss": 5.7431, "step": 22000 }, { "epoch": 0.5760516142246346, "grad_norm": 1.8671875, "learning_rate": 0.001976957935431015, "loss": 5.7406, "step": 22500 }, { "epoch": 0.5888527612074042, "grad_norm": 0.9375, "learning_rate": 0.001976445889551704, "loss": 5.7395, "step": 23000 }, { "epoch": 0.6016539081901738, "grad_norm": 0.5703125, "learning_rate": 0.001975933843672393, "loss": 5.7383, "step": 23500 }, { "epoch": 0.6144550551729435, "grad_norm": 0.7890625, "learning_rate": 0.0019754217977930822, "loss": 5.7394, "step": 24000 }, { "epoch": 0.6272562021557132, "grad_norm": 1.8203125, "learning_rate": 0.0019749097519137717, "loss": 5.7354, "step": 24500 }, { "epoch": 0.6400573491384828, "grad_norm": 1.0078125, "learning_rate": 0.0019743977060344606, "loss": 5.7367, "step": 25000 }, { "epoch": 0.6528584961212525, "grad_norm": 0.435546875, "learning_rate": 0.00197388566015515, "loss": 5.7345, "step": 25500 }, { "epoch": 0.6656596431040221, "grad_norm": 2.140625, "learning_rate": 0.001973373614275839, "loss": 5.7333, "step": 26000 }, { "epoch": 0.6784607900867918, "grad_norm": 0.921875, "learning_rate": 0.0019728615683965284, "loss": 5.7334, "step": 26500 }, { "epoch": 0.6912619370695614, "grad_norm": 2.125, "learning_rate": 0.001972349522517218, "loss": 5.7334, "step": 27000 }, { "epoch": 0.7040630840523311, "grad_norm": 1.1875, "learning_rate": 0.001971837476637907, "loss": 5.7328, "step": 27500 }, { "epoch": 0.7168642310351008, "grad_norm": 4.21875, "learning_rate": 0.001971325430758596, "loss": 5.7329, "step": 28000 }, { "epoch": 0.7296653780178703, "grad_norm": 0.92578125, "learning_rate": 0.001970813384879285, "loss": 5.7324, "step": 28500 }, { "epoch": 0.74246652500064, "grad_norm": 3.375, "learning_rate": 0.0019703013389999746, "loss": 5.7314, "step": 29000 }, { "epoch": 0.7552676719834097, "grad_norm": 0.80078125, "learning_rate": 0.0019697892931206636, "loss": 5.7307, "step": 29500 }, { "epoch": 0.7680688189661794, "grad_norm": 1.671875, "learning_rate": 0.001969277247241353, "loss": 5.7291, "step": 30000 }, { "epoch": 0.780869965948949, "grad_norm": 1.6171875, "learning_rate": 0.001968765201362042, "loss": 5.7313, "step": 30500 }, { "epoch": 0.7936711129317187, "grad_norm": 0.65234375, "learning_rate": 0.0019682531554827314, "loss": 5.7286, "step": 31000 }, { "epoch": 0.8064722599144883, "grad_norm": 0.546875, "learning_rate": 0.0019677411096034204, "loss": 5.732, "step": 31500 }, { "epoch": 0.819273406897258, "grad_norm": 1.265625, "learning_rate": 0.0019672290637241098, "loss": 5.7302, "step": 32000 }, { "epoch": 0.8320745538800276, "grad_norm": 0.67578125, "learning_rate": 0.0019667170178447988, "loss": 5.7277, "step": 32500 }, { "epoch": 0.8448757008627973, "grad_norm": 0.84765625, "learning_rate": 0.001966204971965488, "loss": 5.728, "step": 33000 }, { "epoch": 0.857676847845567, "grad_norm": 2.796875, "learning_rate": 0.001965692926086177, "loss": 5.7276, "step": 33500 }, { "epoch": 0.8704779948283367, "grad_norm": 2.875, "learning_rate": 0.0019651808802068666, "loss": 5.7268, "step": 34000 }, { "epoch": 0.8832791418111062, "grad_norm": 1.703125, "learning_rate": 0.001964668834327556, "loss": 5.7294, "step": 34500 }, { "epoch": 0.8960802887938759, "grad_norm": 0.89453125, "learning_rate": 0.001964156788448245, "loss": 5.7281, "step": 35000 }, { "epoch": 0.9088814357766456, "grad_norm": 1.75, "learning_rate": 0.0019636447425689344, "loss": 5.7281, "step": 35500 }, { "epoch": 0.9216825827594153, "grad_norm": 0.84375, "learning_rate": 0.0019631326966896233, "loss": 5.7254, "step": 36000 }, { "epoch": 0.9344837297421849, "grad_norm": 0.80859375, "learning_rate": 0.0019626206508103127, "loss": 5.7248, "step": 36500 }, { "epoch": 0.9472848767249545, "grad_norm": 0.79296875, "learning_rate": 0.001962108604931002, "loss": 5.7229, "step": 37000 }, { "epoch": 0.9600860237077242, "grad_norm": 1.890625, "learning_rate": 0.001961596559051691, "loss": 5.7217, "step": 37500 }, { "epoch": 0.9728871706904939, "grad_norm": 0.8359375, "learning_rate": 0.00196108451317238, "loss": 5.7211, "step": 38000 }, { "epoch": 0.9856883176732635, "grad_norm": 1.0234375, "learning_rate": 0.0019605724672930695, "loss": 5.7208, "step": 38500 }, { "epoch": 0.9984894646560332, "grad_norm": 0.74609375, "learning_rate": 0.001960060421413759, "loss": 5.7199, "step": 39000 }, { "epoch": 1.0, "eval_loss": 5.719020366668701, "eval_runtime": 1.3381, "eval_samples_per_second": 747.35, "eval_steps_per_second": 2.989, "step": 39059 }, { "epoch": 1.0112906116388027, "grad_norm": 1.3125, "learning_rate": 0.001959548375534448, "loss": 5.7187, "step": 39500 }, { "epoch": 1.0240917586215725, "grad_norm": 2.1875, "learning_rate": 0.0019590363296551373, "loss": 5.7184, "step": 40000 }, { "epoch": 1.036892905604342, "grad_norm": 2.5, "learning_rate": 0.0019585242837758263, "loss": 5.7179, "step": 40500 }, { "epoch": 1.049694052587112, "grad_norm": 0.78125, "learning_rate": 0.0019580122378965157, "loss": 5.7175, "step": 41000 }, { "epoch": 1.0624951995698815, "grad_norm": 0.494140625, "learning_rate": 0.0019575001920172047, "loss": 5.7169, "step": 41500 }, { "epoch": 1.075296346552651, "grad_norm": 0.3984375, "learning_rate": 0.001956988146137894, "loss": 5.7159, "step": 42000 }, { "epoch": 1.0880974935354208, "grad_norm": 2.265625, "learning_rate": 0.001956476100258583, "loss": 5.7154, "step": 42500 }, { "epoch": 1.1008986405181904, "grad_norm": 30.875, "learning_rate": 0.0019559640543792725, "loss": 5.7143, "step": 43000 }, { "epoch": 1.11369978750096, "grad_norm": 0.44921875, "learning_rate": 0.0019554520084999615, "loss": 5.7134, "step": 43500 }, { "epoch": 1.1265009344837298, "grad_norm": 2.3125, "learning_rate": 0.001954939962620651, "loss": 5.713, "step": 44000 }, { "epoch": 1.1393020814664994, "grad_norm": 0.86328125, "learning_rate": 0.0019544279167413403, "loss": 5.7128, "step": 44500 }, { "epoch": 1.1521032284492692, "grad_norm": 2.96875, "learning_rate": 0.0019539158708620293, "loss": 5.714, "step": 45000 }, { "epoch": 1.1649043754320387, "grad_norm": 1.15625, "learning_rate": 0.0019534038249827182, "loss": 5.7138, "step": 45500 }, { "epoch": 1.1777055224148083, "grad_norm": 0.7265625, "learning_rate": 0.0019528917791034077, "loss": 5.7114, "step": 46000 }, { "epoch": 1.190506669397578, "grad_norm": 1.0546875, "learning_rate": 0.0019523797332240969, "loss": 5.7114, "step": 46500 }, { "epoch": 1.2033078163803477, "grad_norm": 0.58984375, "learning_rate": 0.001951867687344786, "loss": 5.7114, "step": 47000 }, { "epoch": 1.2161089633631172, "grad_norm": 2.140625, "learning_rate": 0.0019513556414654755, "loss": 5.712, "step": 47500 }, { "epoch": 1.228910110345887, "grad_norm": 1.3125, "learning_rate": 0.0019508435955861644, "loss": 5.7118, "step": 48000 }, { "epoch": 1.2417112573286566, "grad_norm": 1.03125, "learning_rate": 0.0019503315497068536, "loss": 5.7105, "step": 48500 }, { "epoch": 1.2545124043114262, "grad_norm": 0.83203125, "learning_rate": 0.001949819503827543, "loss": 5.709, "step": 49000 }, { "epoch": 1.267313551294196, "grad_norm": 0.92578125, "learning_rate": 0.0019493074579482322, "loss": 5.7086, "step": 49500 }, { "epoch": 1.2801146982769656, "grad_norm": 1.953125, "learning_rate": 0.0019487954120689214, "loss": 5.7086, "step": 50000 }, { "epoch": 1.2929158452597354, "grad_norm": 1.5703125, "learning_rate": 0.0019482833661896106, "loss": 5.7083, "step": 50500 }, { "epoch": 1.305716992242505, "grad_norm": 1.1796875, "learning_rate": 0.0019477713203102998, "loss": 5.7073, "step": 51000 }, { "epoch": 1.3185181392252745, "grad_norm": 0.92578125, "learning_rate": 0.0019472592744309892, "loss": 5.7056, "step": 51500 }, { "epoch": 1.3313192862080443, "grad_norm": 0.703125, "learning_rate": 0.0019467472285516782, "loss": 5.7052, "step": 52000 }, { "epoch": 1.3441204331908139, "grad_norm": 1.125, "learning_rate": 0.0019462351826723674, "loss": 5.7075, "step": 52500 }, { "epoch": 1.3569215801735837, "grad_norm": 0.486328125, "learning_rate": 0.0019457231367930568, "loss": 5.7071, "step": 53000 }, { "epoch": 1.3697227271563532, "grad_norm": 1.703125, "learning_rate": 0.001945211090913746, "loss": 5.7071, "step": 53500 }, { "epoch": 1.3825238741391228, "grad_norm": 2.46875, "learning_rate": 0.001944699045034435, "loss": 5.7083, "step": 54000 }, { "epoch": 1.3953250211218924, "grad_norm": 0.72265625, "learning_rate": 0.0019441869991551244, "loss": 5.7072, "step": 54500 }, { "epoch": 1.4081261681046622, "grad_norm": 4.1875, "learning_rate": 0.0019436749532758136, "loss": 5.7068, "step": 55000 }, { "epoch": 1.4209273150874318, "grad_norm": 0.890625, "learning_rate": 0.0019431629073965028, "loss": 5.7085, "step": 55500 }, { "epoch": 1.4337284620702015, "grad_norm": 0.9296875, "learning_rate": 0.001942650861517192, "loss": 5.707, "step": 56000 }, { "epoch": 1.4465296090529711, "grad_norm": 0.83984375, "learning_rate": 0.0019421388156378812, "loss": 5.7061, "step": 56500 }, { "epoch": 1.4593307560357407, "grad_norm": 1.8046875, "learning_rate": 0.0019416267697585704, "loss": 5.7068, "step": 57000 }, { "epoch": 1.4721319030185105, "grad_norm": 3.984375, "learning_rate": 0.0019411147238792598, "loss": 5.7078, "step": 57500 }, { "epoch": 1.48493305000128, "grad_norm": 0.60546875, "learning_rate": 0.0019406026779999488, "loss": 5.7061, "step": 58000 }, { "epoch": 1.4977341969840499, "grad_norm": 0.671875, "learning_rate": 0.001940090632120638, "loss": 5.7045, "step": 58500 }, { "epoch": 1.5105353439668194, "grad_norm": 2.34375, "learning_rate": 0.0019395785862413274, "loss": 5.7041, "step": 59000 }, { "epoch": 1.523336490949589, "grad_norm": 0.640625, "learning_rate": 0.0019390665403620166, "loss": 5.7041, "step": 59500 }, { "epoch": 1.5361376379323586, "grad_norm": 0.56640625, "learning_rate": 0.0019385544944827055, "loss": 5.7034, "step": 60000 }, { "epoch": 1.5489387849151284, "grad_norm": 0.52734375, "learning_rate": 0.001938042448603395, "loss": 5.7031, "step": 60500 }, { "epoch": 1.5617399318978982, "grad_norm": 3.6875, "learning_rate": 0.0019375304027240841, "loss": 5.7022, "step": 61000 }, { "epoch": 1.5745410788806677, "grad_norm": 1.2421875, "learning_rate": 0.0019370183568447733, "loss": 5.7014, "step": 61500 }, { "epoch": 1.5873422258634373, "grad_norm": 1.28125, "learning_rate": 0.0019365063109654625, "loss": 5.702, "step": 62000 }, { "epoch": 1.600143372846207, "grad_norm": 2.359375, "learning_rate": 0.0019359942650861517, "loss": 5.7015, "step": 62500 }, { "epoch": 1.6129445198289767, "grad_norm": 28.125, "learning_rate": 0.001935482219206841, "loss": 5.7002, "step": 63000 }, { "epoch": 1.6257456668117465, "grad_norm": 0.5703125, "learning_rate": 0.0019349701733275303, "loss": 5.7002, "step": 63500 }, { "epoch": 1.638546813794516, "grad_norm": 0.78515625, "learning_rate": 0.0019344581274482193, "loss": 5.7003, "step": 64000 }, { "epoch": 1.6513479607772856, "grad_norm": 0.65625, "learning_rate": 0.0019339460815689087, "loss": 5.7007, "step": 64500 }, { "epoch": 1.6641491077600552, "grad_norm": 0.68359375, "learning_rate": 0.001933434035689598, "loss": 5.6992, "step": 65000 }, { "epoch": 1.676950254742825, "grad_norm": 0.51953125, "learning_rate": 0.0019329219898102869, "loss": 5.6976, "step": 65500 }, { "epoch": 1.6897514017255946, "grad_norm": 0.98046875, "learning_rate": 0.0019324099439309763, "loss": 5.6983, "step": 66000 }, { "epoch": 1.7025525487083644, "grad_norm": 3.1875, "learning_rate": 0.0019318978980516655, "loss": 5.6977, "step": 66500 }, { "epoch": 1.715353695691134, "grad_norm": 0.8984375, "learning_rate": 0.0019313858521723547, "loss": 5.6967, "step": 67000 }, { "epoch": 1.7281548426739035, "grad_norm": 5.5625, "learning_rate": 0.0019308738062930439, "loss": 5.6962, "step": 67500 }, { "epoch": 1.740955989656673, "grad_norm": 5.28125, "learning_rate": 0.001930361760413733, "loss": 5.6962, "step": 68000 }, { "epoch": 1.7537571366394429, "grad_norm": 0.7890625, "learning_rate": 0.0019298497145344223, "loss": 5.6962, "step": 68500 }, { "epoch": 1.7665582836222127, "grad_norm": 1.40625, "learning_rate": 0.0019293376686551117, "loss": 5.6965, "step": 69000 }, { "epoch": 1.7793594306049823, "grad_norm": 1.3125, "learning_rate": 0.0019288256227758007, "loss": 5.6967, "step": 69500 }, { "epoch": 1.7921605775877518, "grad_norm": 1.1640625, "learning_rate": 0.0019283135768964899, "loss": 5.6962, "step": 70000 }, { "epoch": 1.8049617245705214, "grad_norm": 0.58203125, "learning_rate": 0.0019278015310171793, "loss": 5.6972, "step": 70500 }, { "epoch": 1.8177628715532912, "grad_norm": 1.109375, "learning_rate": 0.0019272894851378685, "loss": 5.6971, "step": 71000 }, { "epoch": 1.830564018536061, "grad_norm": 1.2265625, "learning_rate": 0.0019267774392585574, "loss": 5.6958, "step": 71500 }, { "epoch": 1.8433651655188306, "grad_norm": 1.4453125, "learning_rate": 0.0019262653933792469, "loss": 5.6957, "step": 72000 }, { "epoch": 1.8561663125016001, "grad_norm": 6.03125, "learning_rate": 0.001925753347499936, "loss": 5.6965, "step": 72500 }, { "epoch": 1.8689674594843697, "grad_norm": 34.5, "learning_rate": 0.0019252413016206252, "loss": 5.6964, "step": 73000 }, { "epoch": 1.8817686064671395, "grad_norm": 1.2109375, "learning_rate": 0.0019247292557413144, "loss": 5.6968, "step": 73500 }, { "epoch": 1.894569753449909, "grad_norm": 3.0, "learning_rate": 0.0019242172098620036, "loss": 5.6961, "step": 74000 }, { "epoch": 1.9073709004326789, "grad_norm": 1.078125, "learning_rate": 0.0019237051639826928, "loss": 5.6974, "step": 74500 }, { "epoch": 1.9201720474154484, "grad_norm": 1.0703125, "learning_rate": 0.0019231931181033822, "loss": 5.697, "step": 75000 }, { "epoch": 1.932973194398218, "grad_norm": 1.03125, "learning_rate": 0.0019226810722240712, "loss": 5.695, "step": 75500 }, { "epoch": 1.9457743413809876, "grad_norm": 1.0546875, "learning_rate": 0.0019221690263447604, "loss": 5.6965, "step": 76000 }, { "epoch": 1.9585754883637574, "grad_norm": 1.21875, "learning_rate": 0.0019216569804654498, "loss": 5.6989, "step": 76500 }, { "epoch": 1.9713766353465272, "grad_norm": 0.65234375, "learning_rate": 0.001921144934586139, "loss": 5.6968, "step": 77000 }, { "epoch": 1.9841777823292968, "grad_norm": 1.8359375, "learning_rate": 0.001920632888706828, "loss": 5.6953, "step": 77500 }, { "epoch": 1.9969789293120663, "grad_norm": 1.1875, "learning_rate": 0.0019201208428275174, "loss": 5.6954, "step": 78000 }, { "epoch": 2.0, "eval_loss": 5.694720268249512, "eval_runtime": 1.2928, "eval_samples_per_second": 773.49, "eval_steps_per_second": 3.094, "step": 78118 }, { "epoch": 2.009780076294836, "grad_norm": 0.64453125, "learning_rate": 0.0019196087969482066, "loss": 5.6937, "step": 78500 }, { "epoch": 2.0225812232776055, "grad_norm": 1.8515625, "learning_rate": 0.001919096751068896, "loss": 5.6939, "step": 79000 }, { "epoch": 2.0353823702603755, "grad_norm": 3.171875, "learning_rate": 0.001918584705189585, "loss": 5.6957, "step": 79500 }, { "epoch": 2.048183517243145, "grad_norm": 1.2578125, "learning_rate": 0.0019180726593102742, "loss": 5.6932, "step": 80000 }, { "epoch": 2.0609846642259146, "grad_norm": 1.703125, "learning_rate": 0.0019175606134309636, "loss": 5.6931, "step": 80500 }, { "epoch": 2.073785811208684, "grad_norm": 1.421875, "learning_rate": 0.0019170485675516528, "loss": 5.6922, "step": 81000 }, { "epoch": 2.086586958191454, "grad_norm": 29.625, "learning_rate": 0.0019165365216723418, "loss": 5.6921, "step": 81500 }, { "epoch": 2.099388105174224, "grad_norm": 1.625, "learning_rate": 0.0019160244757930312, "loss": 5.691, "step": 82000 }, { "epoch": 2.1121892521569934, "grad_norm": 1.1328125, "learning_rate": 0.0019155124299137204, "loss": 5.6897, "step": 82500 }, { "epoch": 2.124990399139763, "grad_norm": 1.90625, "learning_rate": 0.0019150003840344096, "loss": 5.6913, "step": 83000 }, { "epoch": 2.1377915461225325, "grad_norm": 1.0, "learning_rate": 0.0019144883381550988, "loss": 5.693, "step": 83500 }, { "epoch": 2.150592693105302, "grad_norm": 1.9609375, "learning_rate": 0.001913976292275788, "loss": 5.691, "step": 84000 }, { "epoch": 2.163393840088072, "grad_norm": 0.70703125, "learning_rate": 0.0019134642463964771, "loss": 5.6912, "step": 84500 }, { "epoch": 2.1761949870708417, "grad_norm": 37.0, "learning_rate": 0.0019129522005171666, "loss": 5.6893, "step": 85000 }, { "epoch": 2.1889961340536113, "grad_norm": 1.328125, "learning_rate": 0.0019124401546378555, "loss": 5.6903, "step": 85500 }, { "epoch": 2.201797281036381, "grad_norm": 1.046875, "learning_rate": 0.0019119281087585447, "loss": 5.6916, "step": 86000 }, { "epoch": 2.2145984280191504, "grad_norm": 0.73828125, "learning_rate": 0.0019114160628792341, "loss": 5.6917, "step": 86500 }, { "epoch": 2.22739957500192, "grad_norm": 2.71875, "learning_rate": 0.0019109040169999231, "loss": 5.6898, "step": 87000 }, { "epoch": 2.24020072198469, "grad_norm": 0.68359375, "learning_rate": 0.0019103919711206123, "loss": 5.69, "step": 87500 }, { "epoch": 2.2530018689674596, "grad_norm": 3.46875, "learning_rate": 0.0019098799252413017, "loss": 5.6909, "step": 88000 }, { "epoch": 2.265803015950229, "grad_norm": 0.92578125, "learning_rate": 0.001909367879361991, "loss": 5.6894, "step": 88500 }, { "epoch": 2.2786041629329987, "grad_norm": 1.4453125, "learning_rate": 0.00190885583348268, "loss": 5.6896, "step": 89000 }, { "epoch": 2.2914053099157683, "grad_norm": 1.1796875, "learning_rate": 0.0019083437876033693, "loss": 5.6879, "step": 89500 }, { "epoch": 2.3042064568985383, "grad_norm": 0.80859375, "learning_rate": 0.0019078317417240585, "loss": 5.6889, "step": 90000 }, { "epoch": 2.317007603881308, "grad_norm": 0.9765625, "learning_rate": 0.0019073196958447477, "loss": 5.6891, "step": 90500 }, { "epoch": 2.3298087508640775, "grad_norm": 0.71875, "learning_rate": 0.0019068076499654369, "loss": 5.6879, "step": 91000 }, { "epoch": 2.342609897846847, "grad_norm": 0.52734375, "learning_rate": 0.001906295604086126, "loss": 5.6875, "step": 91500 }, { "epoch": 2.3554110448296166, "grad_norm": 0.9140625, "learning_rate": 0.0019057835582068153, "loss": 5.6873, "step": 92000 }, { "epoch": 2.368212191812386, "grad_norm": 1.6640625, "learning_rate": 0.0019052715123275047, "loss": 5.6876, "step": 92500 }, { "epoch": 2.381013338795156, "grad_norm": 2.015625, "learning_rate": 0.0019047594664481937, "loss": 5.6874, "step": 93000 }, { "epoch": 2.3938144857779258, "grad_norm": 0.55078125, "learning_rate": 0.001904247420568883, "loss": 5.687, "step": 93500 }, { "epoch": 2.4066156327606953, "grad_norm": 6.125, "learning_rate": 0.0019037353746895723, "loss": 5.6859, "step": 94000 }, { "epoch": 2.419416779743465, "grad_norm": 0.609375, "learning_rate": 0.0019032233288102615, "loss": 5.6844, "step": 94500 }, { "epoch": 2.4322179267262345, "grad_norm": 1.296875, "learning_rate": 0.0019027112829309507, "loss": 5.6844, "step": 95000 }, { "epoch": 2.4450190737090045, "grad_norm": 1.140625, "learning_rate": 0.0019021992370516399, "loss": 5.6854, "step": 95500 }, { "epoch": 2.457820220691774, "grad_norm": 3.640625, "learning_rate": 0.001901687191172329, "loss": 5.6862, "step": 96000 }, { "epoch": 2.4706213676745437, "grad_norm": 0.78515625, "learning_rate": 0.0019011751452930185, "loss": 5.6857, "step": 96500 }, { "epoch": 2.4834225146573132, "grad_norm": 1.09375, "learning_rate": 0.0019006630994137074, "loss": 5.683, "step": 97000 }, { "epoch": 2.496223661640083, "grad_norm": 1.8125, "learning_rate": 0.0019001510535343966, "loss": 5.683, "step": 97500 }, { "epoch": 2.5090248086228524, "grad_norm": 0.89453125, "learning_rate": 0.001899639007655086, "loss": 5.6825, "step": 98000 }, { "epoch": 2.5218259556056224, "grad_norm": 2.03125, "learning_rate": 0.0018991269617757752, "loss": 5.6821, "step": 98500 }, { "epoch": 2.534627102588392, "grad_norm": 1.0546875, "learning_rate": 0.0018986149158964642, "loss": 5.6817, "step": 99000 }, { "epoch": 2.5474282495711615, "grad_norm": 0.55078125, "learning_rate": 0.0018981028700171536, "loss": 5.6814, "step": 99500 }, { "epoch": 2.560229396553931, "grad_norm": 0.5234375, "learning_rate": 0.0018975908241378428, "loss": 5.6819, "step": 100000 }, { "epoch": 2.573030543536701, "grad_norm": 0.54296875, "learning_rate": 0.001897078778258532, "loss": 5.6803, "step": 100500 }, { "epoch": 2.5858316905194707, "grad_norm": 0.55078125, "learning_rate": 0.0018965667323792212, "loss": 5.6809, "step": 101000 }, { "epoch": 2.5986328375022403, "grad_norm": 1.6484375, "learning_rate": 0.0018960546864999104, "loss": 5.6822, "step": 101500 }, { "epoch": 2.61143398448501, "grad_norm": 1.6015625, "learning_rate": 0.0018955426406205996, "loss": 5.6804, "step": 102000 }, { "epoch": 2.6242351314677794, "grad_norm": 0.69140625, "learning_rate": 0.001895030594741289, "loss": 5.6815, "step": 102500 }, { "epoch": 2.637036278450549, "grad_norm": 1.859375, "learning_rate": 0.001894518548861978, "loss": 5.6808, "step": 103000 }, { "epoch": 2.6498374254333186, "grad_norm": 0.62109375, "learning_rate": 0.0018940065029826672, "loss": 5.6811, "step": 103500 }, { "epoch": 2.6626385724160886, "grad_norm": 2.15625, "learning_rate": 0.0018934944571033566, "loss": 5.6795, "step": 104000 }, { "epoch": 2.675439719398858, "grad_norm": 3.640625, "learning_rate": 0.0018929824112240458, "loss": 5.6799, "step": 104500 }, { "epoch": 2.6882408663816277, "grad_norm": 2.015625, "learning_rate": 0.0018924703653447348, "loss": 5.6811, "step": 105000 }, { "epoch": 2.7010420133643973, "grad_norm": 1.7265625, "learning_rate": 0.0018919583194654242, "loss": 5.6798, "step": 105500 }, { "epoch": 2.7138431603471673, "grad_norm": 0.66796875, "learning_rate": 0.0018914462735861134, "loss": 5.6799, "step": 106000 }, { "epoch": 2.726644307329937, "grad_norm": 4.0625, "learning_rate": 0.0018909342277068028, "loss": 5.6795, "step": 106500 }, { "epoch": 2.7394454543127065, "grad_norm": 1.984375, "learning_rate": 0.0018904221818274918, "loss": 5.6804, "step": 107000 }, { "epoch": 2.752246601295476, "grad_norm": 1.640625, "learning_rate": 0.001889910135948181, "loss": 5.6813, "step": 107500 }, { "epoch": 2.7650477482782456, "grad_norm": 1.4296875, "learning_rate": 0.0018893980900688704, "loss": 5.6802, "step": 108000 }, { "epoch": 2.777848895261015, "grad_norm": 6.59375, "learning_rate": 0.0018888860441895596, "loss": 5.6814, "step": 108500 }, { "epoch": 2.7906500422437848, "grad_norm": 0.92578125, "learning_rate": 0.0018883739983102485, "loss": 5.6799, "step": 109000 }, { "epoch": 2.803451189226555, "grad_norm": 2.03125, "learning_rate": 0.001887861952430938, "loss": 5.6792, "step": 109500 }, { "epoch": 2.8162523362093244, "grad_norm": 0.875, "learning_rate": 0.0018873499065516271, "loss": 5.6795, "step": 110000 }, { "epoch": 2.829053483192094, "grad_norm": 0.54296875, "learning_rate": 0.0018868378606723161, "loss": 5.6809, "step": 110500 }, { "epoch": 2.8418546301748635, "grad_norm": 1.3671875, "learning_rate": 0.0018863258147930055, "loss": 5.6804, "step": 111000 }, { "epoch": 2.8546557771576335, "grad_norm": 1.0703125, "learning_rate": 0.0018858137689136947, "loss": 5.6817, "step": 111500 }, { "epoch": 2.867456924140403, "grad_norm": 1.4375, "learning_rate": 0.001885301723034384, "loss": 5.6811, "step": 112000 }, { "epoch": 2.8802580711231727, "grad_norm": 1.3671875, "learning_rate": 0.0018847896771550731, "loss": 5.6811, "step": 112500 }, { "epoch": 2.8930592181059422, "grad_norm": 0.95703125, "learning_rate": 0.0018842776312757623, "loss": 5.6822, "step": 113000 }, { "epoch": 2.905860365088712, "grad_norm": 0.953125, "learning_rate": 0.0018837655853964515, "loss": 5.6842, "step": 113500 }, { "epoch": 2.9186615120714814, "grad_norm": 0.6484375, "learning_rate": 0.001883253539517141, "loss": 5.6826, "step": 114000 }, { "epoch": 2.9314626590542514, "grad_norm": 16.75, "learning_rate": 0.0018827414936378299, "loss": 5.681, "step": 114500 }, { "epoch": 2.944263806037021, "grad_norm": 1.328125, "learning_rate": 0.001882229447758519, "loss": 5.6806, "step": 115000 }, { "epoch": 2.9570649530197906, "grad_norm": 5.96875, "learning_rate": 0.0018817174018792085, "loss": 5.6805, "step": 115500 }, { "epoch": 2.96986610000256, "grad_norm": 1.359375, "learning_rate": 0.0018812053559998977, "loss": 5.6798, "step": 116000 }, { "epoch": 2.98266724698533, "grad_norm": 0.87109375, "learning_rate": 0.0018806933101205867, "loss": 5.6785, "step": 116500 }, { "epoch": 2.9954683939680997, "grad_norm": 0.69140625, "learning_rate": 0.001880181264241276, "loss": 5.6787, "step": 117000 }, { "epoch": 3.0, "eval_loss": 5.67839241027832, "eval_runtime": 1.3366, "eval_samples_per_second": 748.15, "eval_steps_per_second": 2.993, "step": 117177 }, { "epoch": 3.0082695409508693, "grad_norm": 9.75, "learning_rate": 0.0018796692183619653, "loss": 5.6776, "step": 117500 }, { "epoch": 3.021070687933639, "grad_norm": 1.3984375, "learning_rate": 0.0018791571724826545, "loss": 5.6778, "step": 118000 }, { "epoch": 3.0338718349164084, "grad_norm": 1.2578125, "learning_rate": 0.0018786451266033437, "loss": 5.6775, "step": 118500 }, { "epoch": 3.046672981899178, "grad_norm": 0.953125, "learning_rate": 0.0018781330807240329, "loss": 5.6785, "step": 119000 }, { "epoch": 3.059474128881948, "grad_norm": 0.6015625, "learning_rate": 0.001877621034844722, "loss": 5.6803, "step": 119500 }, { "epoch": 3.0722752758647176, "grad_norm": 1.2734375, "learning_rate": 0.0018771089889654115, "loss": 5.68, "step": 120000 }, { "epoch": 3.085076422847487, "grad_norm": 1.0625, "learning_rate": 0.0018765969430861004, "loss": 5.68, "step": 120500 }, { "epoch": 3.0978775698302567, "grad_norm": 1.1796875, "learning_rate": 0.0018760848972067899, "loss": 5.6809, "step": 121000 }, { "epoch": 3.1106787168130263, "grad_norm": 1.09375, "learning_rate": 0.001875572851327479, "loss": 5.6795, "step": 121500 }, { "epoch": 3.123479863795796, "grad_norm": 0.796875, "learning_rate": 0.0018750608054481682, "loss": 5.6798, "step": 122000 }, { "epoch": 3.136281010778566, "grad_norm": 0.953125, "learning_rate": 0.0018745487595688574, "loss": 5.6797, "step": 122500 }, { "epoch": 3.1490821577613355, "grad_norm": 1.2109375, "learning_rate": 0.0018740367136895466, "loss": 5.6787, "step": 123000 }, { "epoch": 3.161883304744105, "grad_norm": 0.953125, "learning_rate": 0.0018735246678102358, "loss": 5.6778, "step": 123500 }, { "epoch": 3.1746844517268746, "grad_norm": 0.66796875, "learning_rate": 0.0018730126219309252, "loss": 5.6766, "step": 124000 }, { "epoch": 3.187485598709644, "grad_norm": 0.80078125, "learning_rate": 0.0018725005760516142, "loss": 5.677, "step": 124500 }, { "epoch": 3.2002867456924142, "grad_norm": 1.046875, "learning_rate": 0.0018719885301723034, "loss": 5.6768, "step": 125000 }, { "epoch": 3.213087892675184, "grad_norm": 1.046875, "learning_rate": 0.0018714764842929928, "loss": 5.6777, "step": 125500 }, { "epoch": 3.2258890396579534, "grad_norm": 1.046875, "learning_rate": 0.001870964438413682, "loss": 5.6744, "step": 126000 }, { "epoch": 3.238690186640723, "grad_norm": 9.1875, "learning_rate": 0.001870452392534371, "loss": 5.6761, "step": 126500 }, { "epoch": 3.2514913336234925, "grad_norm": 1.171875, "learning_rate": 0.0018699403466550604, "loss": 5.6755, "step": 127000 }, { "epoch": 3.2642924806062625, "grad_norm": 1.3671875, "learning_rate": 0.0018694283007757496, "loss": 5.6765, "step": 127500 }, { "epoch": 3.277093627589032, "grad_norm": 4.8125, "learning_rate": 0.0018689162548964388, "loss": 5.6759, "step": 128000 }, { "epoch": 3.2898947745718017, "grad_norm": 0.71875, "learning_rate": 0.001868404209017128, "loss": 5.6742, "step": 128500 }, { "epoch": 3.3026959215545713, "grad_norm": 0.90625, "learning_rate": 0.0018678921631378172, "loss": 5.6755, "step": 129000 }, { "epoch": 3.315497068537341, "grad_norm": 0.8046875, "learning_rate": 0.0018673801172585064, "loss": 5.6754, "step": 129500 }, { "epoch": 3.3282982155201104, "grad_norm": 1.125, "learning_rate": 0.0018668680713791958, "loss": 5.6747, "step": 130000 }, { "epoch": 3.3410993625028804, "grad_norm": 0.53125, "learning_rate": 0.0018663560254998848, "loss": 5.6731, "step": 130500 }, { "epoch": 3.35390050948565, "grad_norm": 0.9140625, "learning_rate": 0.001865843979620574, "loss": 5.6741, "step": 131000 }, { "epoch": 3.3667016564684196, "grad_norm": 1.0625, "learning_rate": 0.0018653319337412634, "loss": 5.6728, "step": 131500 }, { "epoch": 3.379502803451189, "grad_norm": 0.7734375, "learning_rate": 0.0018648198878619523, "loss": 5.6729, "step": 132000 }, { "epoch": 3.3923039504339587, "grad_norm": 0.875, "learning_rate": 0.0018643078419826415, "loss": 5.6729, "step": 132500 }, { "epoch": 3.4051050974167287, "grad_norm": 1.3359375, "learning_rate": 0.001863795796103331, "loss": 5.6722, "step": 133000 }, { "epoch": 3.4179062443994983, "grad_norm": 0.9765625, "learning_rate": 0.0018632837502240201, "loss": 5.6719, "step": 133500 }, { "epoch": 3.430707391382268, "grad_norm": 1.0, "learning_rate": 0.0018627717043447091, "loss": 5.6708, "step": 134000 }, { "epoch": 3.4435085383650375, "grad_norm": 7.90625, "learning_rate": 0.0018622596584653985, "loss": 5.6711, "step": 134500 }, { "epoch": 3.456309685347807, "grad_norm": 0.5390625, "learning_rate": 0.0018617476125860877, "loss": 5.6712, "step": 135000 }, { "epoch": 3.4691108323305766, "grad_norm": 2.625, "learning_rate": 0.0018612355667067771, "loss": 5.6725, "step": 135500 }, { "epoch": 3.4819119793133466, "grad_norm": 2.125, "learning_rate": 0.0018607235208274661, "loss": 5.6723, "step": 136000 }, { "epoch": 3.494713126296116, "grad_norm": 0.8359375, "learning_rate": 0.0018602114749481553, "loss": 5.6712, "step": 136500 }, { "epoch": 3.5075142732788858, "grad_norm": 0.61328125, "learning_rate": 0.0018596994290688447, "loss": 5.6703, "step": 137000 }, { "epoch": 3.5203154202616553, "grad_norm": 0.64453125, "learning_rate": 0.001859187383189534, "loss": 5.671, "step": 137500 }, { "epoch": 3.5331165672444254, "grad_norm": 34.75, "learning_rate": 0.001858675337310223, "loss": 5.669, "step": 138000 }, { "epoch": 3.545917714227195, "grad_norm": 1.3046875, "learning_rate": 0.0018581632914309123, "loss": 5.6699, "step": 138500 }, { "epoch": 3.5587188612099645, "grad_norm": 0.6796875, "learning_rate": 0.0018576512455516015, "loss": 5.6708, "step": 139000 }, { "epoch": 3.571520008192734, "grad_norm": 1.8984375, "learning_rate": 0.0018571391996722907, "loss": 5.6699, "step": 139500 }, { "epoch": 3.5843211551755036, "grad_norm": 0.609375, "learning_rate": 0.0018566271537929799, "loss": 5.6693, "step": 140000 }, { "epoch": 3.597122302158273, "grad_norm": 0.74609375, "learning_rate": 0.001856115107913669, "loss": 5.6688, "step": 140500 }, { "epoch": 3.609923449141043, "grad_norm": 1.5859375, "learning_rate": 0.0018556030620343583, "loss": 5.6677, "step": 141000 }, { "epoch": 3.622724596123813, "grad_norm": 0.65625, "learning_rate": 0.0018550910161550477, "loss": 5.6676, "step": 141500 }, { "epoch": 3.6355257431065824, "grad_norm": 0.478515625, "learning_rate": 0.0018545789702757367, "loss": 5.6683, "step": 142000 }, { "epoch": 3.648326890089352, "grad_norm": 1.203125, "learning_rate": 0.0018540669243964259, "loss": 5.6689, "step": 142500 }, { "epoch": 3.6611280370721215, "grad_norm": 1.2890625, "learning_rate": 0.0018535548785171153, "loss": 5.6685, "step": 143000 }, { "epoch": 3.6739291840548916, "grad_norm": 0.953125, "learning_rate": 0.0018530428326378045, "loss": 5.669, "step": 143500 }, { "epoch": 3.686730331037661, "grad_norm": 0.671875, "learning_rate": 0.0018525307867584934, "loss": 5.668, "step": 144000 }, { "epoch": 3.6995314780204307, "grad_norm": 4.34375, "learning_rate": 0.0018520187408791829, "loss": 5.6681, "step": 144500 }, { "epoch": 3.7123326250032003, "grad_norm": 65.0, "learning_rate": 0.001851506694999872, "loss": 5.6675, "step": 145000 }, { "epoch": 3.72513377198597, "grad_norm": 1.265625, "learning_rate": 0.0018509946491205612, "loss": 5.6677, "step": 145500 }, { "epoch": 3.7379349189687394, "grad_norm": 0.63671875, "learning_rate": 0.0018504826032412504, "loss": 5.668, "step": 146000 }, { "epoch": 3.750736065951509, "grad_norm": 0.671875, "learning_rate": 0.0018499705573619396, "loss": 5.668, "step": 146500 }, { "epoch": 3.763537212934279, "grad_norm": 0.59765625, "learning_rate": 0.0018494585114826288, "loss": 5.6679, "step": 147000 }, { "epoch": 3.7763383599170486, "grad_norm": 0.671875, "learning_rate": 0.0018489464656033182, "loss": 5.6673, "step": 147500 }, { "epoch": 3.789139506899818, "grad_norm": 2.28125, "learning_rate": 0.0018484344197240072, "loss": 5.6681, "step": 148000 }, { "epoch": 3.8019406538825877, "grad_norm": 1.265625, "learning_rate": 0.0018479223738446964, "loss": 5.6675, "step": 148500 }, { "epoch": 3.8147418008653577, "grad_norm": 0.75390625, "learning_rate": 0.0018474103279653858, "loss": 5.667, "step": 149000 }, { "epoch": 3.8275429478481273, "grad_norm": 2.4375, "learning_rate": 0.001846898282086075, "loss": 5.6675, "step": 149500 }, { "epoch": 3.840344094830897, "grad_norm": 0.58203125, "learning_rate": 0.0018463862362067642, "loss": 5.6677, "step": 150000 }, { "epoch": 3.8531452418136665, "grad_norm": 2.46875, "learning_rate": 0.0018458741903274534, "loss": 5.6678, "step": 150500 }, { "epoch": 3.865946388796436, "grad_norm": 1.046875, "learning_rate": 0.0018453621444481426, "loss": 5.6668, "step": 151000 }, { "epoch": 3.8787475357792056, "grad_norm": 1.546875, "learning_rate": 0.001844850098568832, "loss": 5.6688, "step": 151500 }, { "epoch": 3.8915486827619756, "grad_norm": 1.484375, "learning_rate": 0.001844338052689521, "loss": 5.6689, "step": 152000 }, { "epoch": 3.904349829744745, "grad_norm": 2.828125, "learning_rate": 0.0018438260068102102, "loss": 5.6685, "step": 152500 }, { "epoch": 3.9171509767275148, "grad_norm": 4.84375, "learning_rate": 0.0018433139609308996, "loss": 5.6674, "step": 153000 }, { "epoch": 3.9299521237102844, "grad_norm": 1.3515625, "learning_rate": 0.0018428019150515886, "loss": 5.6678, "step": 153500 }, { "epoch": 3.9427532706930544, "grad_norm": 0.8125, "learning_rate": 0.0018422898691722778, "loss": 5.6673, "step": 154000 }, { "epoch": 3.955554417675824, "grad_norm": 0.67578125, "learning_rate": 0.0018417778232929672, "loss": 5.6667, "step": 154500 }, { "epoch": 3.9683555646585935, "grad_norm": 1.1953125, "learning_rate": 0.0018412657774136564, "loss": 5.6673, "step": 155000 }, { "epoch": 3.981156711641363, "grad_norm": 0.53125, "learning_rate": 0.0018407537315343453, "loss": 5.6677, "step": 155500 }, { "epoch": 3.9939578586241327, "grad_norm": 3.03125, "learning_rate": 0.0018402416856550348, "loss": 5.6668, "step": 156000 }, { "epoch": 4.0, "eval_loss": 5.668220520019531, "eval_runtime": 1.2989, "eval_samples_per_second": 769.908, "eval_steps_per_second": 3.08, "step": 156236 }, { "epoch": 4.006759005606902, "grad_norm": 17.75, "learning_rate": 0.001839729639775724, "loss": 5.667, "step": 156500 }, { "epoch": 4.019560152589672, "grad_norm": 0.79296875, "learning_rate": 0.0018392175938964131, "loss": 5.6679, "step": 157000 }, { "epoch": 4.032361299572441, "grad_norm": 0.8515625, "learning_rate": 0.0018387055480171023, "loss": 5.6671, "step": 157500 }, { "epoch": 4.045162446555211, "grad_norm": 1.375, "learning_rate": 0.0018381935021377915, "loss": 5.6664, "step": 158000 }, { "epoch": 4.057963593537981, "grad_norm": 1.7265625, "learning_rate": 0.0018376814562584807, "loss": 5.6662, "step": 158500 }, { "epoch": 4.070764740520751, "grad_norm": 0.72265625, "learning_rate": 0.0018371694103791701, "loss": 5.6671, "step": 159000 }, { "epoch": 4.083565887503521, "grad_norm": 1.0234375, "learning_rate": 0.0018366573644998591, "loss": 5.6659, "step": 159500 }, { "epoch": 4.09636703448629, "grad_norm": 2.484375, "learning_rate": 0.0018361453186205483, "loss": 5.6665, "step": 160000 }, { "epoch": 4.10916818146906, "grad_norm": 0.8046875, "learning_rate": 0.0018356332727412377, "loss": 5.6667, "step": 160500 }, { "epoch": 4.121969328451829, "grad_norm": 100.5, "learning_rate": 0.001835121226861927, "loss": 5.6653, "step": 161000 }, { "epoch": 4.134770475434599, "grad_norm": 1.046875, "learning_rate": 0.001834609180982616, "loss": 5.665, "step": 161500 }, { "epoch": 4.147571622417368, "grad_norm": 0.796875, "learning_rate": 0.0018340971351033053, "loss": 5.6653, "step": 162000 }, { "epoch": 4.160372769400138, "grad_norm": 0.69921875, "learning_rate": 0.0018335850892239945, "loss": 5.6656, "step": 162500 }, { "epoch": 4.173173916382908, "grad_norm": 1.59375, "learning_rate": 0.001833073043344684, "loss": 5.6667, "step": 163000 }, { "epoch": 4.185975063365677, "grad_norm": 0.64453125, "learning_rate": 0.0018325609974653729, "loss": 5.6655, "step": 163500 }, { "epoch": 4.198776210348448, "grad_norm": 18.0, "learning_rate": 0.001832048951586062, "loss": 5.6638, "step": 164000 }, { "epoch": 4.211577357331217, "grad_norm": 0.56640625, "learning_rate": 0.0018315369057067515, "loss": 5.6652, "step": 164500 }, { "epoch": 4.224378504313987, "grad_norm": 1.078125, "learning_rate": 0.0018310248598274407, "loss": 5.6653, "step": 165000 }, { "epoch": 4.237179651296756, "grad_norm": 2.0625, "learning_rate": 0.0018305128139481297, "loss": 5.6651, "step": 165500 }, { "epoch": 4.249980798279526, "grad_norm": 1.5390625, "learning_rate": 0.001830000768068819, "loss": 5.6659, "step": 166000 }, { "epoch": 4.2627819452622955, "grad_norm": 1.078125, "learning_rate": 0.0018294887221895083, "loss": 5.666, "step": 166500 }, { "epoch": 4.275583092245065, "grad_norm": 0.74609375, "learning_rate": 0.0018289766763101975, "loss": 5.6654, "step": 167000 }, { "epoch": 4.288384239227835, "grad_norm": 0.921875, "learning_rate": 0.0018284646304308867, "loss": 5.6649, "step": 167500 }, { "epoch": 4.301185386210604, "grad_norm": 2.1875, "learning_rate": 0.0018279525845515759, "loss": 5.6648, "step": 168000 }, { "epoch": 4.313986533193374, "grad_norm": 1.5546875, "learning_rate": 0.001827440538672265, "loss": 5.6647, "step": 168500 }, { "epoch": 4.326787680176144, "grad_norm": 0.71484375, "learning_rate": 0.0018269284927929545, "loss": 5.6647, "step": 169000 }, { "epoch": 4.339588827158914, "grad_norm": 1.890625, "learning_rate": 0.0018264164469136434, "loss": 5.6648, "step": 169500 }, { "epoch": 4.352389974141683, "grad_norm": 1.640625, "learning_rate": 0.0018259044010343326, "loss": 5.6653, "step": 170000 }, { "epoch": 4.365191121124453, "grad_norm": 1.328125, "learning_rate": 0.001825392355155022, "loss": 5.6641, "step": 170500 }, { "epoch": 4.3779922681072225, "grad_norm": 4.0625, "learning_rate": 0.0018248803092757112, "loss": 5.6647, "step": 171000 }, { "epoch": 4.390793415089992, "grad_norm": 0.95703125, "learning_rate": 0.0018243682633964002, "loss": 5.6635, "step": 171500 }, { "epoch": 4.403594562072762, "grad_norm": 1.2421875, "learning_rate": 0.0018238562175170896, "loss": 5.6634, "step": 172000 }, { "epoch": 4.416395709055531, "grad_norm": 0.9296875, "learning_rate": 0.0018233441716377788, "loss": 5.6635, "step": 172500 }, { "epoch": 4.429196856038301, "grad_norm": 0.86328125, "learning_rate": 0.0018228321257584678, "loss": 5.6632, "step": 173000 }, { "epoch": 4.44199800302107, "grad_norm": 1.3125, "learning_rate": 0.0018223200798791572, "loss": 5.6634, "step": 173500 }, { "epoch": 4.45479915000384, "grad_norm": 3.203125, "learning_rate": 0.0018218080339998464, "loss": 5.6636, "step": 174000 }, { "epoch": 4.46760029698661, "grad_norm": 1.1953125, "learning_rate": 0.0018212959881205356, "loss": 5.6626, "step": 174500 }, { "epoch": 4.48040144396938, "grad_norm": 6.09375, "learning_rate": 0.0018207839422412248, "loss": 5.6632, "step": 175000 }, { "epoch": 4.49320259095215, "grad_norm": 1.0078125, "learning_rate": 0.001820271896361914, "loss": 5.6624, "step": 175500 }, { "epoch": 4.506003737934919, "grad_norm": 3.78125, "learning_rate": 0.0018197598504826032, "loss": 5.6623, "step": 176000 }, { "epoch": 4.518804884917689, "grad_norm": 0.546875, "learning_rate": 0.0018192478046032926, "loss": 5.6628, "step": 176500 }, { "epoch": 4.531606031900458, "grad_norm": 0.458984375, "learning_rate": 0.0018187357587239816, "loss": 5.6627, "step": 177000 }, { "epoch": 4.544407178883228, "grad_norm": 1.6328125, "learning_rate": 0.001818223712844671, "loss": 5.6615, "step": 177500 }, { "epoch": 4.5572083258659974, "grad_norm": 0.703125, "learning_rate": 0.0018177116669653602, "loss": 5.662, "step": 178000 }, { "epoch": 4.570009472848767, "grad_norm": 0.75, "learning_rate": 0.0018171996210860494, "loss": 5.6625, "step": 178500 }, { "epoch": 4.582810619831537, "grad_norm": 0.6484375, "learning_rate": 0.0018166875752067386, "loss": 5.6618, "step": 179000 }, { "epoch": 4.595611766814306, "grad_norm": 0.81640625, "learning_rate": 0.0018161755293274278, "loss": 5.6621, "step": 179500 }, { "epoch": 4.608412913797077, "grad_norm": 0.54296875, "learning_rate": 0.001815663483448117, "loss": 5.6621, "step": 180000 }, { "epoch": 4.621214060779846, "grad_norm": 3.34375, "learning_rate": 0.0018151514375688064, "loss": 5.6606, "step": 180500 }, { "epoch": 4.634015207762616, "grad_norm": 1.0625, "learning_rate": 0.0018146393916894953, "loss": 5.6616, "step": 181000 }, { "epoch": 4.646816354745385, "grad_norm": 13.5625, "learning_rate": 0.0018141273458101845, "loss": 5.6626, "step": 181500 }, { "epoch": 4.659617501728155, "grad_norm": 1.2109375, "learning_rate": 0.001813615299930874, "loss": 5.661, "step": 182000 }, { "epoch": 4.6724186487109245, "grad_norm": 2.84375, "learning_rate": 0.0018131032540515631, "loss": 5.6614, "step": 182500 }, { "epoch": 4.685219795693694, "grad_norm": 3.03125, "learning_rate": 0.0018125912081722521, "loss": 5.6612, "step": 183000 }, { "epoch": 4.698020942676464, "grad_norm": 0.89453125, "learning_rate": 0.0018120791622929415, "loss": 5.66, "step": 183500 }, { "epoch": 4.710822089659233, "grad_norm": 0.765625, "learning_rate": 0.0018115671164136307, "loss": 5.6592, "step": 184000 }, { "epoch": 4.723623236642003, "grad_norm": 4.21875, "learning_rate": 0.00181105507053432, "loss": 5.659, "step": 184500 }, { "epoch": 4.736424383624772, "grad_norm": 0.62890625, "learning_rate": 0.0018105430246550091, "loss": 5.6591, "step": 185000 }, { "epoch": 4.749225530607543, "grad_norm": 5.0625, "learning_rate": 0.0018100309787756983, "loss": 5.6591, "step": 185500 }, { "epoch": 4.762026677590312, "grad_norm": 0.51953125, "learning_rate": 0.0018095189328963875, "loss": 5.659, "step": 186000 }, { "epoch": 4.774827824573082, "grad_norm": 0.640625, "learning_rate": 0.001809006887017077, "loss": 5.6603, "step": 186500 }, { "epoch": 4.7876289715558515, "grad_norm": 0.58984375, "learning_rate": 0.001808494841137766, "loss": 5.6594, "step": 187000 }, { "epoch": 4.800430118538621, "grad_norm": 0.54296875, "learning_rate": 0.001807982795258455, "loss": 5.6604, "step": 187500 }, { "epoch": 4.813231265521391, "grad_norm": 3.40625, "learning_rate": 0.0018074707493791445, "loss": 5.6598, "step": 188000 }, { "epoch": 4.82603241250416, "grad_norm": 2.109375, "learning_rate": 0.0018069587034998337, "loss": 5.6597, "step": 188500 }, { "epoch": 4.83883355948693, "grad_norm": 1.125, "learning_rate": 0.0018064466576205227, "loss": 5.6594, "step": 189000 }, { "epoch": 4.851634706469699, "grad_norm": 1.3125, "learning_rate": 0.001805934611741212, "loss": 5.6584, "step": 189500 }, { "epoch": 4.864435853452469, "grad_norm": 0.59765625, "learning_rate": 0.0018054225658619013, "loss": 5.6582, "step": 190000 }, { "epoch": 4.877237000435239, "grad_norm": 2.796875, "learning_rate": 0.0018049105199825905, "loss": 5.6581, "step": 190500 }, { "epoch": 4.890038147418009, "grad_norm": 1.0546875, "learning_rate": 0.0018043984741032797, "loss": 5.658, "step": 191000 }, { "epoch": 4.902839294400779, "grad_norm": 0.78515625, "learning_rate": 0.0018038864282239689, "loss": 5.6576, "step": 191500 }, { "epoch": 4.915640441383548, "grad_norm": 2.453125, "learning_rate": 0.0018033743823446583, "loss": 5.6576, "step": 192000 }, { "epoch": 4.928441588366318, "grad_norm": 1.328125, "learning_rate": 0.0018028623364653475, "loss": 5.6574, "step": 192500 }, { "epoch": 4.941242735349087, "grad_norm": 2.4375, "learning_rate": 0.0018023502905860364, "loss": 5.6571, "step": 193000 }, { "epoch": 4.954043882331857, "grad_norm": 0.83203125, "learning_rate": 0.0018018382447067259, "loss": 5.6567, "step": 193500 }, { "epoch": 4.9668450293146265, "grad_norm": 0.79296875, "learning_rate": 0.001801326198827415, "loss": 5.6561, "step": 194000 }, { "epoch": 4.979646176297396, "grad_norm": 1.109375, "learning_rate": 0.0018008141529481042, "loss": 5.6571, "step": 194500 }, { "epoch": 4.992447323280166, "grad_norm": 0.63671875, "learning_rate": 0.0018003021070687934, "loss": 5.6571, "step": 195000 }, { "epoch": 5.0, "eval_loss": 5.6548848152160645, "eval_runtime": 1.2976, "eval_samples_per_second": 770.633, "eval_steps_per_second": 3.083, "step": 195295 }, { "epoch": 5.005248470262935, "grad_norm": 2.171875, "learning_rate": 0.0017997900611894826, "loss": 5.6575, "step": 195500 }, { "epoch": 5.018049617245706, "grad_norm": 0.6171875, "learning_rate": 0.0017992780153101718, "loss": 5.6569, "step": 196000 }, { "epoch": 5.030850764228475, "grad_norm": 1.0546875, "learning_rate": 0.001798765969430861, "loss": 5.6566, "step": 196500 }, { "epoch": 5.043651911211245, "grad_norm": 14.4375, "learning_rate": 0.0017982539235515502, "loss": 5.6565, "step": 197000 }, { "epoch": 5.056453058194014, "grad_norm": 2.984375, "learning_rate": 0.0017977418776722394, "loss": 5.6566, "step": 197500 }, { "epoch": 5.069254205176784, "grad_norm": 0.474609375, "learning_rate": 0.0017972298317929288, "loss": 5.6572, "step": 198000 }, { "epoch": 5.0820553521595535, "grad_norm": 1.5078125, "learning_rate": 0.0017967177859136178, "loss": 5.6556, "step": 198500 }, { "epoch": 5.094856499142323, "grad_norm": 1.1640625, "learning_rate": 0.001796205740034307, "loss": 5.6562, "step": 199000 }, { "epoch": 5.107657646125093, "grad_norm": 0.921875, "learning_rate": 0.0017956936941549964, "loss": 5.656, "step": 199500 }, { "epoch": 5.120458793107862, "grad_norm": 0.91796875, "learning_rate": 0.0017951816482756856, "loss": 5.6562, "step": 200000 }, { "epoch": 5.133259940090632, "grad_norm": 0.51171875, "learning_rate": 0.0017946696023963746, "loss": 5.656, "step": 200500 }, { "epoch": 5.146061087073401, "grad_norm": 4.75, "learning_rate": 0.001794157556517064, "loss": 5.6567, "step": 201000 }, { "epoch": 5.158862234056172, "grad_norm": 0.88671875, "learning_rate": 0.0017936455106377532, "loss": 5.6557, "step": 201500 }, { "epoch": 5.171663381038941, "grad_norm": 1.1953125, "learning_rate": 0.0017931334647584424, "loss": 5.6558, "step": 202000 }, { "epoch": 5.184464528021711, "grad_norm": 0.78515625, "learning_rate": 0.0017926214188791316, "loss": 5.6565, "step": 202500 }, { "epoch": 5.197265675004481, "grad_norm": 2.375, "learning_rate": 0.0017921093729998208, "loss": 5.6562, "step": 203000 }, { "epoch": 5.21006682198725, "grad_norm": 1.25, "learning_rate": 0.00179159732712051, "loss": 5.6569, "step": 203500 }, { "epoch": 5.22286796897002, "grad_norm": 0.53125, "learning_rate": 0.0017910852812411994, "loss": 5.6573, "step": 204000 }, { "epoch": 5.235669115952789, "grad_norm": 0.671875, "learning_rate": 0.0017905732353618883, "loss": 5.6566, "step": 204500 }, { "epoch": 5.248470262935559, "grad_norm": 1.796875, "learning_rate": 0.0017900611894825775, "loss": 5.6567, "step": 205000 }, { "epoch": 5.261271409918328, "grad_norm": 3.203125, "learning_rate": 0.001789549143603267, "loss": 5.6555, "step": 205500 }, { "epoch": 5.274072556901098, "grad_norm": 0.52734375, "learning_rate": 0.0017890370977239561, "loss": 5.656, "step": 206000 }, { "epoch": 5.2868737038838685, "grad_norm": 1.1328125, "learning_rate": 0.0017885250518446453, "loss": 5.655, "step": 206500 }, { "epoch": 5.299674850866638, "grad_norm": 1.8671875, "learning_rate": 0.0017880130059653345, "loss": 5.657, "step": 207000 }, { "epoch": 5.312475997849408, "grad_norm": 0.796875, "learning_rate": 0.0017875009600860237, "loss": 5.6563, "step": 207500 }, { "epoch": 5.325277144832177, "grad_norm": 2.8125, "learning_rate": 0.0017869889142067131, "loss": 5.6554, "step": 208000 }, { "epoch": 5.338078291814947, "grad_norm": 1.0078125, "learning_rate": 0.0017864768683274021, "loss": 5.6564, "step": 208500 }, { "epoch": 5.350879438797716, "grad_norm": 0.59375, "learning_rate": 0.0017859648224480913, "loss": 5.6555, "step": 209000 }, { "epoch": 5.363680585780486, "grad_norm": 1.53125, "learning_rate": 0.0017854527765687807, "loss": 5.6548, "step": 209500 }, { "epoch": 5.3764817327632555, "grad_norm": 0.66015625, "learning_rate": 0.00178494073068947, "loss": 5.6547, "step": 210000 }, { "epoch": 5.389282879746025, "grad_norm": 5.03125, "learning_rate": 0.001784428684810159, "loss": 5.6543, "step": 210500 }, { "epoch": 5.402084026728795, "grad_norm": 1.5, "learning_rate": 0.0017839166389308483, "loss": 5.6549, "step": 211000 }, { "epoch": 5.414885173711564, "grad_norm": 1.4921875, "learning_rate": 0.0017834045930515375, "loss": 5.6545, "step": 211500 }, { "epoch": 5.427686320694335, "grad_norm": 0.953125, "learning_rate": 0.0017828925471722267, "loss": 5.6552, "step": 212000 }, { "epoch": 5.440487467677104, "grad_norm": 0.5859375, "learning_rate": 0.0017823805012929159, "loss": 5.6537, "step": 212500 }, { "epoch": 5.453288614659874, "grad_norm": 1.0390625, "learning_rate": 0.001781868455413605, "loss": 5.6547, "step": 213000 }, { "epoch": 5.466089761642643, "grad_norm": 0.94921875, "learning_rate": 0.0017813564095342943, "loss": 5.6555, "step": 213500 }, { "epoch": 5.478890908625413, "grad_norm": 0.796875, "learning_rate": 0.0017808443636549837, "loss": 5.656, "step": 214000 }, { "epoch": 5.4916920556081825, "grad_norm": 1.234375, "learning_rate": 0.0017803323177756727, "loss": 5.6533, "step": 214500 }, { "epoch": 5.504493202590952, "grad_norm": 5.65625, "learning_rate": 0.0017798202718963619, "loss": 5.6543, "step": 215000 }, { "epoch": 5.517294349573722, "grad_norm": 0.4921875, "learning_rate": 0.0017793082260170513, "loss": 5.6539, "step": 215500 }, { "epoch": 5.530095496556491, "grad_norm": 2.9375, "learning_rate": 0.0017787961801377405, "loss": 5.6534, "step": 216000 }, { "epoch": 5.542896643539261, "grad_norm": 2.3125, "learning_rate": 0.0017782841342584294, "loss": 5.654, "step": 216500 }, { "epoch": 5.55569779052203, "grad_norm": 0.54296875, "learning_rate": 0.0017777720883791189, "loss": 5.6548, "step": 217000 }, { "epoch": 5.568498937504801, "grad_norm": 1.7734375, "learning_rate": 0.001777260042499808, "loss": 5.6524, "step": 217500 }, { "epoch": 5.58130008448757, "grad_norm": 0.62890625, "learning_rate": 0.001776747996620497, "loss": 5.6526, "step": 218000 }, { "epoch": 5.59410123147034, "grad_norm": 1.5, "learning_rate": 0.0017762359507411864, "loss": 5.6529, "step": 218500 }, { "epoch": 5.60690237845311, "grad_norm": 0.7265625, "learning_rate": 0.0017757239048618756, "loss": 5.6527, "step": 219000 }, { "epoch": 5.619703525435879, "grad_norm": 3.125, "learning_rate": 0.0017752118589825648, "loss": 5.6531, "step": 219500 }, { "epoch": 5.632504672418649, "grad_norm": 2.03125, "learning_rate": 0.001774699813103254, "loss": 5.6524, "step": 220000 }, { "epoch": 5.645305819401418, "grad_norm": 1.1796875, "learning_rate": 0.0017741877672239432, "loss": 5.6532, "step": 220500 }, { "epoch": 5.658106966384188, "grad_norm": 1.1953125, "learning_rate": 0.0017736757213446326, "loss": 5.6526, "step": 221000 }, { "epoch": 5.670908113366957, "grad_norm": 0.69921875, "learning_rate": 0.0017731636754653218, "loss": 5.6519, "step": 221500 }, { "epoch": 5.683709260349727, "grad_norm": 0.8046875, "learning_rate": 0.0017726516295860108, "loss": 5.6525, "step": 222000 }, { "epoch": 5.696510407332497, "grad_norm": 1.3125, "learning_rate": 0.0017721395837067002, "loss": 5.6529, "step": 222500 }, { "epoch": 5.709311554315267, "grad_norm": 0.5390625, "learning_rate": 0.0017716275378273894, "loss": 5.6526, "step": 223000 }, { "epoch": 5.722112701298037, "grad_norm": 1.375, "learning_rate": 0.0017711154919480786, "loss": 5.6515, "step": 223500 }, { "epoch": 5.734913848280806, "grad_norm": 0.73828125, "learning_rate": 0.0017706034460687678, "loss": 5.6514, "step": 224000 }, { "epoch": 5.747714995263576, "grad_norm": 1.7265625, "learning_rate": 0.001770091400189457, "loss": 5.6523, "step": 224500 }, { "epoch": 5.760516142246345, "grad_norm": 1.125, "learning_rate": 0.0017695793543101462, "loss": 5.652, "step": 225000 }, { "epoch": 5.773317289229115, "grad_norm": 0.52734375, "learning_rate": 0.0017690673084308356, "loss": 5.6518, "step": 225500 }, { "epoch": 5.7861184362118845, "grad_norm": 0.765625, "learning_rate": 0.0017685552625515246, "loss": 5.6524, "step": 226000 }, { "epoch": 5.798919583194654, "grad_norm": 2.140625, "learning_rate": 0.0017680432166722138, "loss": 5.652, "step": 226500 }, { "epoch": 5.811720730177424, "grad_norm": 0.53515625, "learning_rate": 0.0017675311707929032, "loss": 5.6516, "step": 227000 }, { "epoch": 5.824521877160193, "grad_norm": 0.421875, "learning_rate": 0.0017670191249135924, "loss": 5.6511, "step": 227500 }, { "epoch": 5.837323024142963, "grad_norm": 0.7890625, "learning_rate": 0.0017665070790342813, "loss": 5.6505, "step": 228000 }, { "epoch": 5.850124171125733, "grad_norm": 0.765625, "learning_rate": 0.0017659950331549708, "loss": 5.6527, "step": 228500 }, { "epoch": 5.862925318108503, "grad_norm": 5.4375, "learning_rate": 0.00176548298727566, "loss": 5.6513, "step": 229000 }, { "epoch": 5.875726465091272, "grad_norm": 1.4140625, "learning_rate": 0.0017649709413963491, "loss": 5.6503, "step": 229500 }, { "epoch": 5.888527612074042, "grad_norm": 0.73046875, "learning_rate": 0.0017644588955170383, "loss": 5.6508, "step": 230000 }, { "epoch": 5.9013287590568115, "grad_norm": 1.171875, "learning_rate": 0.0017639468496377275, "loss": 5.6517, "step": 230500 }, { "epoch": 5.914129906039581, "grad_norm": 0.59375, "learning_rate": 0.0017634348037584167, "loss": 5.6514, "step": 231000 }, { "epoch": 5.926931053022351, "grad_norm": 0.75, "learning_rate": 0.0017629227578791061, "loss": 5.6513, "step": 231500 }, { "epoch": 5.93973220000512, "grad_norm": 3.78125, "learning_rate": 0.0017624107119997951, "loss": 5.6507, "step": 232000 }, { "epoch": 5.95253334698789, "grad_norm": 0.6171875, "learning_rate": 0.0017618986661204843, "loss": 5.6494, "step": 232500 }, { "epoch": 5.965334493970659, "grad_norm": 0.96484375, "learning_rate": 0.0017613866202411737, "loss": 5.6515, "step": 233000 }, { "epoch": 5.978135640953429, "grad_norm": 2.515625, "learning_rate": 0.001760874574361863, "loss": 5.6506, "step": 233500 }, { "epoch": 5.990936787936199, "grad_norm": 0.66015625, "learning_rate": 0.0017603625284825521, "loss": 5.6504, "step": 234000 }, { "epoch": 6.0, "eval_loss": 5.650012016296387, "eval_runtime": 1.2888, "eval_samples_per_second": 775.89, "eval_steps_per_second": 3.104, "step": 234354 }, { "epoch": 6.003737934918969, "grad_norm": 0.97265625, "learning_rate": 0.0017598504826032413, "loss": 5.6498, "step": 234500 }, { "epoch": 6.016539081901739, "grad_norm": 0.703125, "learning_rate": 0.0017593384367239305, "loss": 5.6508, "step": 235000 }, { "epoch": 6.029340228884508, "grad_norm": 1.0546875, "learning_rate": 0.00175882639084462, "loss": 5.6499, "step": 235500 }, { "epoch": 6.042141375867278, "grad_norm": 0.9921875, "learning_rate": 0.001758314344965309, "loss": 5.6495, "step": 236000 }, { "epoch": 6.054942522850047, "grad_norm": 2.265625, "learning_rate": 0.001757802299085998, "loss": 5.6499, "step": 236500 }, { "epoch": 6.067743669832817, "grad_norm": 0.71875, "learning_rate": 0.0017572902532066875, "loss": 5.6505, "step": 237000 }, { "epoch": 6.0805448168155865, "grad_norm": 1.015625, "learning_rate": 0.0017567782073273767, "loss": 5.6499, "step": 237500 }, { "epoch": 6.093345963798356, "grad_norm": 0.8125, "learning_rate": 0.0017562661614480657, "loss": 5.6518, "step": 238000 }, { "epoch": 6.106147110781126, "grad_norm": 0.78125, "learning_rate": 0.001755754115568755, "loss": 5.6496, "step": 238500 }, { "epoch": 6.118948257763896, "grad_norm": 1.296875, "learning_rate": 0.0017552420696894443, "loss": 5.6497, "step": 239000 }, { "epoch": 6.131749404746666, "grad_norm": 1.3828125, "learning_rate": 0.0017547300238101333, "loss": 5.6508, "step": 239500 }, { "epoch": 6.144550551729435, "grad_norm": 0.7421875, "learning_rate": 0.0017542179779308227, "loss": 5.6498, "step": 240000 }, { "epoch": 6.157351698712205, "grad_norm": 3.296875, "learning_rate": 0.0017537059320515119, "loss": 5.6499, "step": 240500 }, { "epoch": 6.170152845694974, "grad_norm": 1.0390625, "learning_rate": 0.001753193886172201, "loss": 5.6491, "step": 241000 }, { "epoch": 6.182953992677744, "grad_norm": 1.34375, "learning_rate": 0.0017526818402928902, "loss": 5.6488, "step": 241500 }, { "epoch": 6.1957551396605135, "grad_norm": 0.45703125, "learning_rate": 0.0017521697944135794, "loss": 5.6494, "step": 242000 }, { "epoch": 6.208556286643283, "grad_norm": 1.21875, "learning_rate": 0.0017516577485342686, "loss": 5.6489, "step": 242500 }, { "epoch": 6.221357433626053, "grad_norm": 2.140625, "learning_rate": 0.001751145702654958, "loss": 5.6495, "step": 243000 }, { "epoch": 6.234158580608822, "grad_norm": 2.109375, "learning_rate": 0.001750633656775647, "loss": 5.6492, "step": 243500 }, { "epoch": 6.246959727591592, "grad_norm": 0.80859375, "learning_rate": 0.0017501216108963362, "loss": 5.6487, "step": 244000 }, { "epoch": 6.259760874574362, "grad_norm": 0.94921875, "learning_rate": 0.0017496095650170256, "loss": 5.6481, "step": 244500 }, { "epoch": 6.272562021557132, "grad_norm": 0.87890625, "learning_rate": 0.0017490975191377148, "loss": 5.6491, "step": 245000 }, { "epoch": 6.285363168539901, "grad_norm": 1.3359375, "learning_rate": 0.0017485854732584038, "loss": 5.6486, "step": 245500 }, { "epoch": 6.298164315522671, "grad_norm": 0.55078125, "learning_rate": 0.0017480734273790932, "loss": 5.6478, "step": 246000 }, { "epoch": 6.3109654625054405, "grad_norm": 1.015625, "learning_rate": 0.0017475613814997824, "loss": 5.6478, "step": 246500 }, { "epoch": 6.32376660948821, "grad_norm": 1.84375, "learning_rate": 0.0017470493356204716, "loss": 5.6494, "step": 247000 }, { "epoch": 6.33656775647098, "grad_norm": 0.494140625, "learning_rate": 0.0017465372897411608, "loss": 5.6489, "step": 247500 }, { "epoch": 6.349368903453749, "grad_norm": 1.15625, "learning_rate": 0.00174602524386185, "loss": 5.6482, "step": 248000 }, { "epoch": 6.362170050436519, "grad_norm": 0.62890625, "learning_rate": 0.0017455131979825394, "loss": 5.6487, "step": 248500 }, { "epoch": 6.374971197419288, "grad_norm": 0.87890625, "learning_rate": 0.0017450011521032286, "loss": 5.6485, "step": 249000 }, { "epoch": 6.387772344402059, "grad_norm": 1.3203125, "learning_rate": 0.0017444891062239176, "loss": 5.6488, "step": 249500 }, { "epoch": 6.4005734913848285, "grad_norm": 0.90234375, "learning_rate": 0.001743977060344607, "loss": 5.6484, "step": 250000 }, { "epoch": 6.413374638367598, "grad_norm": 1.546875, "learning_rate": 0.0017434650144652962, "loss": 5.6473, "step": 250500 }, { "epoch": 6.426175785350368, "grad_norm": 0.546875, "learning_rate": 0.0017429529685859854, "loss": 5.649, "step": 251000 }, { "epoch": 6.438976932333137, "grad_norm": 1.5078125, "learning_rate": 0.0017424409227066746, "loss": 5.648, "step": 251500 }, { "epoch": 6.451778079315907, "grad_norm": 0.80078125, "learning_rate": 0.0017419288768273638, "loss": 5.6473, "step": 252000 }, { "epoch": 6.464579226298676, "grad_norm": 1.2734375, "learning_rate": 0.001741416830948053, "loss": 5.6482, "step": 252500 }, { "epoch": 6.477380373281446, "grad_norm": 0.6796875, "learning_rate": 0.0017409047850687424, "loss": 5.6474, "step": 253000 }, { "epoch": 6.4901815202642155, "grad_norm": 1.453125, "learning_rate": 0.0017403927391894313, "loss": 5.6468, "step": 253500 }, { "epoch": 6.502982667246985, "grad_norm": 0.7890625, "learning_rate": 0.0017398806933101205, "loss": 5.6471, "step": 254000 }, { "epoch": 6.515783814229755, "grad_norm": 0.5234375, "learning_rate": 0.00173936864743081, "loss": 5.6476, "step": 254500 }, { "epoch": 6.528584961212525, "grad_norm": 8.0625, "learning_rate": 0.0017388566015514991, "loss": 5.6478, "step": 255000 }, { "epoch": 6.541386108195295, "grad_norm": 1.03125, "learning_rate": 0.0017383445556721881, "loss": 5.648, "step": 255500 }, { "epoch": 6.554187255178064, "grad_norm": 0.8671875, "learning_rate": 0.0017378325097928775, "loss": 5.648, "step": 256000 }, { "epoch": 6.566988402160834, "grad_norm": 0.69921875, "learning_rate": 0.0017373204639135667, "loss": 5.6467, "step": 256500 }, { "epoch": 6.579789549143603, "grad_norm": 0.5234375, "learning_rate": 0.001736808418034256, "loss": 5.6476, "step": 257000 }, { "epoch": 6.592590696126373, "grad_norm": 2.15625, "learning_rate": 0.0017362963721549451, "loss": 5.6456, "step": 257500 }, { "epoch": 6.6053918431091425, "grad_norm": 0.72265625, "learning_rate": 0.0017357843262756343, "loss": 5.6473, "step": 258000 }, { "epoch": 6.618192990091912, "grad_norm": 0.59375, "learning_rate": 0.0017352722803963235, "loss": 5.6466, "step": 258500 }, { "epoch": 6.630994137074682, "grad_norm": 1.9921875, "learning_rate": 0.001734760234517013, "loss": 5.6469, "step": 259000 }, { "epoch": 6.643795284057451, "grad_norm": 0.87890625, "learning_rate": 0.001734248188637702, "loss": 5.6466, "step": 259500 }, { "epoch": 6.656596431040221, "grad_norm": 1.09375, "learning_rate": 0.001733736142758391, "loss": 5.6469, "step": 260000 }, { "epoch": 6.669397578022991, "grad_norm": 0.5703125, "learning_rate": 0.0017332240968790805, "loss": 5.6476, "step": 260500 }, { "epoch": 6.682198725005761, "grad_norm": 0.76171875, "learning_rate": 0.0017327120509997695, "loss": 5.6453, "step": 261000 }, { "epoch": 6.69499987198853, "grad_norm": 0.6796875, "learning_rate": 0.0017322000051204587, "loss": 5.6458, "step": 261500 }, { "epoch": 6.7078010189713, "grad_norm": 2.53125, "learning_rate": 0.001731687959241148, "loss": 5.6461, "step": 262000 }, { "epoch": 6.72060216595407, "grad_norm": 0.396484375, "learning_rate": 0.0017311759133618373, "loss": 5.6458, "step": 262500 }, { "epoch": 6.733403312936839, "grad_norm": 1.1640625, "learning_rate": 0.0017306638674825265, "loss": 5.6464, "step": 263000 }, { "epoch": 6.746204459919609, "grad_norm": 2.03125, "learning_rate": 0.0017301518216032157, "loss": 5.647, "step": 263500 }, { "epoch": 6.759005606902378, "grad_norm": 0.953125, "learning_rate": 0.0017296397757239049, "loss": 5.6468, "step": 264000 }, { "epoch": 6.771806753885148, "grad_norm": 0.474609375, "learning_rate": 0.0017291277298445943, "loss": 5.6472, "step": 264500 }, { "epoch": 6.784607900867917, "grad_norm": 3.59375, "learning_rate": 0.0017286156839652833, "loss": 5.6471, "step": 265000 }, { "epoch": 6.797409047850687, "grad_norm": 0.51953125, "learning_rate": 0.0017281036380859724, "loss": 5.6455, "step": 265500 }, { "epoch": 6.8102101948334575, "grad_norm": 7.21875, "learning_rate": 0.0017275915922066619, "loss": 5.6456, "step": 266000 }, { "epoch": 6.823011341816227, "grad_norm": 5.6875, "learning_rate": 0.001727079546327351, "loss": 5.6455, "step": 266500 }, { "epoch": 6.835812488798997, "grad_norm": 2.578125, "learning_rate": 0.00172656750044804, "loss": 5.6462, "step": 267000 }, { "epoch": 6.848613635781766, "grad_norm": 0.953125, "learning_rate": 0.0017260554545687294, "loss": 5.6463, "step": 267500 }, { "epoch": 6.861414782764536, "grad_norm": 4.1875, "learning_rate": 0.0017255434086894186, "loss": 5.6465, "step": 268000 }, { "epoch": 6.874215929747305, "grad_norm": 1.21875, "learning_rate": 0.0017250313628101078, "loss": 5.6455, "step": 268500 }, { "epoch": 6.887017076730075, "grad_norm": 0.515625, "learning_rate": 0.001724519316930797, "loss": 5.646, "step": 269000 }, { "epoch": 6.8998182237128445, "grad_norm": 0.578125, "learning_rate": 0.0017240072710514862, "loss": 5.6466, "step": 269500 }, { "epoch": 6.912619370695614, "grad_norm": 0.4921875, "learning_rate": 0.0017234952251721754, "loss": 5.6451, "step": 270000 }, { "epoch": 6.925420517678384, "grad_norm": 0.55859375, "learning_rate": 0.0017229831792928648, "loss": 5.6459, "step": 270500 }, { "epoch": 6.938221664661153, "grad_norm": 3.8125, "learning_rate": 0.0017224711334135538, "loss": 5.6458, "step": 271000 }, { "epoch": 6.951022811643924, "grad_norm": 0.48828125, "learning_rate": 0.001721959087534243, "loss": 5.646, "step": 271500 }, { "epoch": 6.963823958626693, "grad_norm": 0.65625, "learning_rate": 0.0017214470416549324, "loss": 5.6456, "step": 272000 }, { "epoch": 6.976625105609463, "grad_norm": 0.84765625, "learning_rate": 0.0017209349957756216, "loss": 5.6446, "step": 272500 }, { "epoch": 6.989426252592232, "grad_norm": 1.6953125, "learning_rate": 0.0017204229498963106, "loss": 5.646, "step": 273000 }, { "epoch": 7.0, "eval_loss": 5.643965244293213, "eval_runtime": 1.3794, "eval_samples_per_second": 724.964, "eval_steps_per_second": 2.9, "step": 273413 }, { "epoch": 7.002227399575002, "grad_norm": 0.392578125, "learning_rate": 0.001719910904017, "loss": 5.6451, "step": 273500 }, { "epoch": 7.0150285465577715, "grad_norm": 0.58984375, "learning_rate": 0.0017193988581376892, "loss": 5.6448, "step": 274000 }, { "epoch": 7.027829693540541, "grad_norm": 0.671875, "learning_rate": 0.0017188868122583784, "loss": 5.646, "step": 274500 }, { "epoch": 7.040630840523311, "grad_norm": 10.625, "learning_rate": 0.0017183747663790676, "loss": 5.6441, "step": 275000 }, { "epoch": 7.05343198750608, "grad_norm": 0.6484375, "learning_rate": 0.0017178627204997568, "loss": 5.6451, "step": 275500 }, { "epoch": 7.06623313448885, "grad_norm": 0.84375, "learning_rate": 0.001717350674620446, "loss": 5.645, "step": 276000 }, { "epoch": 7.07903428147162, "grad_norm": 0.66015625, "learning_rate": 0.0017168386287411354, "loss": 5.6449, "step": 276500 }, { "epoch": 7.09183542845439, "grad_norm": 0.74609375, "learning_rate": 0.0017163265828618243, "loss": 5.6449, "step": 277000 }, { "epoch": 7.104636575437159, "grad_norm": 1.1015625, "learning_rate": 0.0017158145369825138, "loss": 5.6445, "step": 277500 }, { "epoch": 7.117437722419929, "grad_norm": 5.0, "learning_rate": 0.001715302491103203, "loss": 5.6439, "step": 278000 }, { "epoch": 7.130238869402699, "grad_norm": 3.640625, "learning_rate": 0.0017147904452238921, "loss": 5.6445, "step": 278500 }, { "epoch": 7.143040016385468, "grad_norm": 0.78125, "learning_rate": 0.0017142783993445813, "loss": 5.6462, "step": 279000 }, { "epoch": 7.155841163368238, "grad_norm": 0.9296875, "learning_rate": 0.0017137663534652705, "loss": 5.6442, "step": 279500 }, { "epoch": 7.168642310351007, "grad_norm": 0.765625, "learning_rate": 0.0017132543075859597, "loss": 5.6443, "step": 280000 }, { "epoch": 7.181443457333777, "grad_norm": 0.75, "learning_rate": 0.0017127422617066491, "loss": 5.6449, "step": 280500 }, { "epoch": 7.194244604316546, "grad_norm": 4.25, "learning_rate": 0.0017122302158273381, "loss": 5.6439, "step": 281000 }, { "epoch": 7.207045751299316, "grad_norm": 1.09375, "learning_rate": 0.0017117181699480273, "loss": 5.6456, "step": 281500 }, { "epoch": 7.2198468982820865, "grad_norm": 0.62109375, "learning_rate": 0.0017112061240687167, "loss": 5.6457, "step": 282000 }, { "epoch": 7.232648045264856, "grad_norm": 1.015625, "learning_rate": 0.0017106940781894057, "loss": 5.6448, "step": 282500 }, { "epoch": 7.245449192247626, "grad_norm": 0.8046875, "learning_rate": 0.001710182032310095, "loss": 5.6431, "step": 283000 }, { "epoch": 7.258250339230395, "grad_norm": 0.734375, "learning_rate": 0.0017096699864307843, "loss": 5.6432, "step": 283500 }, { "epoch": 7.271051486213165, "grad_norm": 0.8828125, "learning_rate": 0.0017091579405514735, "loss": 5.6444, "step": 284000 }, { "epoch": 7.283852633195934, "grad_norm": 0.5859375, "learning_rate": 0.0017086458946721625, "loss": 5.6444, "step": 284500 }, { "epoch": 7.296653780178704, "grad_norm": 0.462890625, "learning_rate": 0.001708133848792852, "loss": 5.645, "step": 285000 }, { "epoch": 7.3094549271614735, "grad_norm": 1.203125, "learning_rate": 0.001707621802913541, "loss": 5.645, "step": 285500 }, { "epoch": 7.322256074144243, "grad_norm": 1.453125, "learning_rate": 0.0017071097570342303, "loss": 5.6457, "step": 286000 }, { "epoch": 7.335057221127013, "grad_norm": 0.7734375, "learning_rate": 0.0017065977111549195, "loss": 5.6445, "step": 286500 }, { "epoch": 7.347858368109783, "grad_norm": 0.58984375, "learning_rate": 0.0017060856652756087, "loss": 5.6456, "step": 287000 }, { "epoch": 7.360659515092553, "grad_norm": 0.94140625, "learning_rate": 0.0017055736193962979, "loss": 5.6455, "step": 287500 }, { "epoch": 7.373460662075322, "grad_norm": 0.66796875, "learning_rate": 0.0017050615735169873, "loss": 5.6451, "step": 288000 }, { "epoch": 7.386261809058092, "grad_norm": 0.734375, "learning_rate": 0.0017045495276376763, "loss": 5.644, "step": 288500 }, { "epoch": 7.399062956040861, "grad_norm": 0.53515625, "learning_rate": 0.0017040374817583654, "loss": 5.6448, "step": 289000 }, { "epoch": 7.411864103023631, "grad_norm": 1.4375, "learning_rate": 0.0017035254358790549, "loss": 5.6455, "step": 289500 }, { "epoch": 7.4246652500064005, "grad_norm": 0.9296875, "learning_rate": 0.001703013389999744, "loss": 5.6435, "step": 290000 }, { "epoch": 7.43746639698917, "grad_norm": 1.1875, "learning_rate": 0.001702501344120433, "loss": 5.6444, "step": 290500 }, { "epoch": 7.45026754397194, "grad_norm": 0.7734375, "learning_rate": 0.0017019892982411224, "loss": 5.6444, "step": 291000 }, { "epoch": 7.463068690954709, "grad_norm": 0.484375, "learning_rate": 0.0017014772523618116, "loss": 5.6432, "step": 291500 }, { "epoch": 7.475869837937479, "grad_norm": 1.703125, "learning_rate": 0.001700965206482501, "loss": 5.644, "step": 292000 }, { "epoch": 7.488670984920249, "grad_norm": 4.5, "learning_rate": 0.00170045316060319, "loss": 5.6447, "step": 292500 }, { "epoch": 7.501472131903019, "grad_norm": 0.86328125, "learning_rate": 0.0016999411147238792, "loss": 5.644, "step": 293000 }, { "epoch": 7.514273278885788, "grad_norm": 2.109375, "learning_rate": 0.0016994290688445686, "loss": 5.6436, "step": 293500 }, { "epoch": 7.527074425868558, "grad_norm": 0.9140625, "learning_rate": 0.0016989170229652578, "loss": 5.6443, "step": 294000 }, { "epoch": 7.539875572851328, "grad_norm": 1.3203125, "learning_rate": 0.0016984049770859468, "loss": 5.6431, "step": 294500 }, { "epoch": 7.552676719834097, "grad_norm": 0.609375, "learning_rate": 0.0016978929312066362, "loss": 5.6443, "step": 295000 }, { "epoch": 7.565477866816867, "grad_norm": 1.125, "learning_rate": 0.0016973808853273254, "loss": 5.6428, "step": 295500 }, { "epoch": 7.578279013799636, "grad_norm": 1.0703125, "learning_rate": 0.0016968688394480146, "loss": 5.6434, "step": 296000 }, { "epoch": 7.591080160782406, "grad_norm": 0.703125, "learning_rate": 0.0016963567935687038, "loss": 5.643, "step": 296500 }, { "epoch": 7.6038813077651755, "grad_norm": 0.6328125, "learning_rate": 0.001695844747689393, "loss": 5.6441, "step": 297000 }, { "epoch": 7.616682454747945, "grad_norm": 0.78515625, "learning_rate": 0.0016953327018100822, "loss": 5.6428, "step": 297500 }, { "epoch": 7.6294836017307155, "grad_norm": 0.8046875, "learning_rate": 0.0016948206559307716, "loss": 5.6421, "step": 298000 }, { "epoch": 7.642284748713485, "grad_norm": 0.49609375, "learning_rate": 0.0016943086100514606, "loss": 5.6424, "step": 298500 }, { "epoch": 7.655085895696255, "grad_norm": 0.64453125, "learning_rate": 0.0016937965641721498, "loss": 5.6422, "step": 299000 }, { "epoch": 7.667887042679024, "grad_norm": 0.64453125, "learning_rate": 0.0016932845182928392, "loss": 5.6428, "step": 299500 }, { "epoch": 7.680688189661794, "grad_norm": 0.5546875, "learning_rate": 0.0016927724724135284, "loss": 5.6436, "step": 300000 }, { "epoch": 7.693489336644563, "grad_norm": 0.7109375, "learning_rate": 0.0016922604265342174, "loss": 5.6432, "step": 300500 }, { "epoch": 7.706290483627333, "grad_norm": 0.51171875, "learning_rate": 0.0016917483806549068, "loss": 5.6432, "step": 301000 }, { "epoch": 7.7190916306101025, "grad_norm": 0.4140625, "learning_rate": 0.001691236334775596, "loss": 5.6434, "step": 301500 }, { "epoch": 7.731892777592872, "grad_norm": 1.2890625, "learning_rate": 0.0016907242888962852, "loss": 5.6424, "step": 302000 }, { "epoch": 7.744693924575642, "grad_norm": 0.51171875, "learning_rate": 0.0016902122430169743, "loss": 5.6427, "step": 302500 }, { "epoch": 7.757495071558411, "grad_norm": 0.50390625, "learning_rate": 0.0016897001971376635, "loss": 5.6427, "step": 303000 }, { "epoch": 7.770296218541182, "grad_norm": 0.5234375, "learning_rate": 0.0016891881512583527, "loss": 5.6424, "step": 303500 }, { "epoch": 7.783097365523951, "grad_norm": 0.494140625, "learning_rate": 0.001688676105379042, "loss": 5.6429, "step": 304000 }, { "epoch": 7.795898512506721, "grad_norm": 1.8203125, "learning_rate": 0.0016881640594997311, "loss": 5.6426, "step": 304500 }, { "epoch": 7.80869965948949, "grad_norm": 0.470703125, "learning_rate": 0.0016876520136204205, "loss": 5.6418, "step": 305000 }, { "epoch": 7.82150080647226, "grad_norm": 0.37890625, "learning_rate": 0.0016871399677411097, "loss": 5.6419, "step": 305500 }, { "epoch": 7.8343019534550296, "grad_norm": 0.4296875, "learning_rate": 0.0016866279218617987, "loss": 5.6429, "step": 306000 }, { "epoch": 7.847103100437799, "grad_norm": 0.3828125, "learning_rate": 0.0016861158759824881, "loss": 5.6431, "step": 306500 }, { "epoch": 7.859904247420569, "grad_norm": 0.47265625, "learning_rate": 0.0016856038301031773, "loss": 5.642, "step": 307000 }, { "epoch": 7.872705394403338, "grad_norm": 0.84765625, "learning_rate": 0.0016850917842238665, "loss": 5.6423, "step": 307500 }, { "epoch": 7.885506541386108, "grad_norm": 0.427734375, "learning_rate": 0.0016845797383445557, "loss": 5.6436, "step": 308000 }, { "epoch": 7.898307688368877, "grad_norm": 1.234375, "learning_rate": 0.001684067692465245, "loss": 5.642, "step": 308500 }, { "epoch": 7.911108835351648, "grad_norm": 0.97265625, "learning_rate": 0.001683555646585934, "loss": 5.6431, "step": 309000 }, { "epoch": 7.9239099823344175, "grad_norm": 0.96484375, "learning_rate": 0.0016830436007066235, "loss": 5.6436, "step": 309500 }, { "epoch": 7.936711129317187, "grad_norm": 0.92578125, "learning_rate": 0.0016825315548273125, "loss": 5.6429, "step": 310000 }, { "epoch": 7.949512276299957, "grad_norm": 0.72265625, "learning_rate": 0.0016820195089480017, "loss": 5.6429, "step": 310500 }, { "epoch": 7.962313423282726, "grad_norm": 1.109375, "learning_rate": 0.001681507463068691, "loss": 5.6424, "step": 311000 }, { "epoch": 7.975114570265496, "grad_norm": 0.84765625, "learning_rate": 0.0016809954171893803, "loss": 5.6424, "step": 311500 }, { "epoch": 7.987915717248265, "grad_norm": 2.640625, "learning_rate": 0.0016804833713100693, "loss": 5.642, "step": 312000 }, { "epoch": 8.0, "eval_loss": 5.642501354217529, "eval_runtime": 1.2812, "eval_samples_per_second": 780.497, "eval_steps_per_second": 3.122, "step": 312472 }, { "epoch": 8.000716864231036, "grad_norm": 0.6328125, "learning_rate": 0.0016799713254307587, "loss": 5.6415, "step": 312500 }, { "epoch": 8.013518011213804, "grad_norm": 0.51171875, "learning_rate": 0.0016794592795514479, "loss": 5.6414, "step": 313000 }, { "epoch": 8.026319158196575, "grad_norm": 1.1953125, "learning_rate": 0.001678947233672137, "loss": 5.6422, "step": 313500 }, { "epoch": 8.039120305179344, "grad_norm": 0.90625, "learning_rate": 0.0016784351877928263, "loss": 5.6416, "step": 314000 }, { "epoch": 8.051921452162114, "grad_norm": 0.56640625, "learning_rate": 0.0016779231419135154, "loss": 5.6421, "step": 314500 }, { "epoch": 8.064722599144883, "grad_norm": 2.15625, "learning_rate": 0.0016774110960342046, "loss": 5.642, "step": 315000 }, { "epoch": 8.077523746127653, "grad_norm": 1.4140625, "learning_rate": 0.001676899050154894, "loss": 5.6418, "step": 315500 }, { "epoch": 8.090324893110422, "grad_norm": 0.9453125, "learning_rate": 0.001676387004275583, "loss": 5.6408, "step": 316000 }, { "epoch": 8.103126040093192, "grad_norm": 0.58203125, "learning_rate": 0.0016758749583962722, "loss": 5.6421, "step": 316500 }, { "epoch": 8.115927187075963, "grad_norm": 0.99609375, "learning_rate": 0.0016753629125169616, "loss": 5.6421, "step": 317000 }, { "epoch": 8.128728334058732, "grad_norm": 0.53125, "learning_rate": 0.0016748508666376508, "loss": 5.6412, "step": 317500 }, { "epoch": 8.141529481041502, "grad_norm": 1.2265625, "learning_rate": 0.0016743388207583398, "loss": 5.6415, "step": 318000 }, { "epoch": 8.15433062802427, "grad_norm": 2.375, "learning_rate": 0.0016738267748790292, "loss": 5.6419, "step": 318500 }, { "epoch": 8.167131775007041, "grad_norm": 0.474609375, "learning_rate": 0.0016733147289997184, "loss": 5.6406, "step": 319000 }, { "epoch": 8.17993292198981, "grad_norm": 4.0625, "learning_rate": 0.0016728026831204078, "loss": 5.6412, "step": 319500 }, { "epoch": 8.19273406897258, "grad_norm": 0.474609375, "learning_rate": 0.0016722906372410968, "loss": 5.6412, "step": 320000 }, { "epoch": 8.205535215955349, "grad_norm": 0.66796875, "learning_rate": 0.001671778591361786, "loss": 5.6424, "step": 320500 }, { "epoch": 8.21833636293812, "grad_norm": 0.6640625, "learning_rate": 0.0016712665454824754, "loss": 5.6402, "step": 321000 }, { "epoch": 8.231137509920888, "grad_norm": 1.0703125, "learning_rate": 0.0016707544996031646, "loss": 5.6406, "step": 321500 }, { "epoch": 8.243938656903659, "grad_norm": 2.53125, "learning_rate": 0.0016702424537238536, "loss": 5.6405, "step": 322000 }, { "epoch": 8.256739803886429, "grad_norm": 0.83203125, "learning_rate": 0.001669730407844543, "loss": 5.6413, "step": 322500 }, { "epoch": 8.269540950869198, "grad_norm": 0.6171875, "learning_rate": 0.0016692183619652322, "loss": 5.64, "step": 323000 }, { "epoch": 8.282342097851968, "grad_norm": 0.66015625, "learning_rate": 0.0016687063160859214, "loss": 5.6413, "step": 323500 }, { "epoch": 8.295143244834737, "grad_norm": 0.984375, "learning_rate": 0.0016681942702066106, "loss": 5.641, "step": 324000 }, { "epoch": 8.307944391817507, "grad_norm": 1.5390625, "learning_rate": 0.0016676822243272998, "loss": 5.6406, "step": 324500 }, { "epoch": 8.320745538800276, "grad_norm": 0.89453125, "learning_rate": 0.001667170178447989, "loss": 5.6407, "step": 325000 }, { "epoch": 8.333546685783046, "grad_norm": 1.5625, "learning_rate": 0.0016666581325686784, "loss": 5.6405, "step": 325500 }, { "epoch": 8.346347832765815, "grad_norm": 0.4453125, "learning_rate": 0.0016661460866893673, "loss": 5.6405, "step": 326000 }, { "epoch": 8.359148979748586, "grad_norm": 0.6953125, "learning_rate": 0.0016656340408100565, "loss": 5.642, "step": 326500 }, { "epoch": 8.371950126731354, "grad_norm": 2.96875, "learning_rate": 0.001665121994930746, "loss": 5.6405, "step": 327000 }, { "epoch": 8.384751273714125, "grad_norm": 0.6328125, "learning_rate": 0.001664609949051435, "loss": 5.6414, "step": 327500 }, { "epoch": 8.397552420696895, "grad_norm": 1.515625, "learning_rate": 0.0016640979031721241, "loss": 5.6414, "step": 328000 }, { "epoch": 8.410353567679664, "grad_norm": 1.609375, "learning_rate": 0.0016635858572928135, "loss": 5.6399, "step": 328500 }, { "epoch": 8.423154714662434, "grad_norm": 1.53125, "learning_rate": 0.0016630738114135027, "loss": 5.6418, "step": 329000 }, { "epoch": 8.435955861645203, "grad_norm": 1.640625, "learning_rate": 0.0016625617655341917, "loss": 5.6404, "step": 329500 }, { "epoch": 8.448757008627974, "grad_norm": 1.1796875, "learning_rate": 0.0016620497196548811, "loss": 5.6408, "step": 330000 }, { "epoch": 8.461558155610742, "grad_norm": 0.625, "learning_rate": 0.0016615376737755703, "loss": 5.6409, "step": 330500 }, { "epoch": 8.474359302593513, "grad_norm": 0.8984375, "learning_rate": 0.0016610256278962595, "loss": 5.6404, "step": 331000 }, { "epoch": 8.487160449576281, "grad_norm": 0.921875, "learning_rate": 0.0016605135820169487, "loss": 5.6401, "step": 331500 }, { "epoch": 8.499961596559052, "grad_norm": 0.44140625, "learning_rate": 0.001660001536137638, "loss": 5.642, "step": 332000 }, { "epoch": 8.51276274354182, "grad_norm": 0.53515625, "learning_rate": 0.001659489490258327, "loss": 5.6404, "step": 332500 }, { "epoch": 8.525563890524591, "grad_norm": 1.8046875, "learning_rate": 0.0016589774443790165, "loss": 5.6402, "step": 333000 }, { "epoch": 8.538365037507361, "grad_norm": 0.7421875, "learning_rate": 0.0016584653984997055, "loss": 5.6396, "step": 333500 }, { "epoch": 8.55116618449013, "grad_norm": 1.6640625, "learning_rate": 0.001657953352620395, "loss": 5.6403, "step": 334000 }, { "epoch": 8.5639673314729, "grad_norm": 2.8125, "learning_rate": 0.001657441306741084, "loss": 5.6409, "step": 334500 }, { "epoch": 8.57676847845567, "grad_norm": 1.265625, "learning_rate": 0.0016569292608617733, "loss": 5.6405, "step": 335000 }, { "epoch": 8.58956962543844, "grad_norm": 0.435546875, "learning_rate": 0.0016564172149824625, "loss": 5.6403, "step": 335500 }, { "epoch": 8.602370772421208, "grad_norm": 1.953125, "learning_rate": 0.0016559051691031517, "loss": 5.6397, "step": 336000 }, { "epoch": 8.615171919403979, "grad_norm": 0.98828125, "learning_rate": 0.0016553931232238409, "loss": 5.6407, "step": 336500 }, { "epoch": 8.627973066386748, "grad_norm": 3.015625, "learning_rate": 0.0016548810773445303, "loss": 5.6401, "step": 337000 }, { "epoch": 8.640774213369518, "grad_norm": 0.455078125, "learning_rate": 0.0016543690314652193, "loss": 5.6412, "step": 337500 }, { "epoch": 8.653575360352288, "grad_norm": 0.9921875, "learning_rate": 0.0016538569855859084, "loss": 5.6402, "step": 338000 }, { "epoch": 8.666376507335057, "grad_norm": 23.5, "learning_rate": 0.0016533449397065979, "loss": 5.6404, "step": 338500 }, { "epoch": 8.679177654317828, "grad_norm": 0.4140625, "learning_rate": 0.001652832893827287, "loss": 5.6408, "step": 339000 }, { "epoch": 8.691978801300596, "grad_norm": 1.171875, "learning_rate": 0.001652320847947976, "loss": 5.6404, "step": 339500 }, { "epoch": 8.704779948283367, "grad_norm": 0.56640625, "learning_rate": 0.0016518088020686654, "loss": 5.6401, "step": 340000 }, { "epoch": 8.717581095266135, "grad_norm": 0.63671875, "learning_rate": 0.0016512967561893546, "loss": 5.6404, "step": 340500 }, { "epoch": 8.730382242248906, "grad_norm": 2.0, "learning_rate": 0.0016507847103100438, "loss": 5.6409, "step": 341000 }, { "epoch": 8.743183389231675, "grad_norm": 1.0546875, "learning_rate": 0.001650272664430733, "loss": 5.6401, "step": 341500 }, { "epoch": 8.755984536214445, "grad_norm": 1.1328125, "learning_rate": 0.0016497606185514222, "loss": 5.6403, "step": 342000 }, { "epoch": 8.768785683197214, "grad_norm": 0.8515625, "learning_rate": 0.0016492485726721114, "loss": 5.6393, "step": 342500 }, { "epoch": 8.781586830179984, "grad_norm": 1.984375, "learning_rate": 0.0016487365267928008, "loss": 5.6401, "step": 343000 }, { "epoch": 8.794387977162753, "grad_norm": 2.640625, "learning_rate": 0.0016482244809134898, "loss": 5.64, "step": 343500 }, { "epoch": 8.807189124145523, "grad_norm": 0.80078125, "learning_rate": 0.001647712435034179, "loss": 5.6399, "step": 344000 }, { "epoch": 8.819990271128294, "grad_norm": 0.96484375, "learning_rate": 0.0016472003891548684, "loss": 5.641, "step": 344500 }, { "epoch": 8.832791418111062, "grad_norm": 1.421875, "learning_rate": 0.0016466883432755576, "loss": 5.6391, "step": 345000 }, { "epoch": 8.845592565093833, "grad_norm": 1.84375, "learning_rate": 0.0016461762973962466, "loss": 5.6407, "step": 345500 }, { "epoch": 8.858393712076602, "grad_norm": 5.90625, "learning_rate": 0.001645664251516936, "loss": 5.6409, "step": 346000 }, { "epoch": 8.871194859059372, "grad_norm": 0.71484375, "learning_rate": 0.0016451522056376252, "loss": 5.6398, "step": 346500 }, { "epoch": 8.88399600604214, "grad_norm": 0.7421875, "learning_rate": 0.0016446401597583142, "loss": 5.6401, "step": 347000 }, { "epoch": 8.896797153024911, "grad_norm": 1.1328125, "learning_rate": 0.0016441281138790036, "loss": 5.6395, "step": 347500 }, { "epoch": 8.90959830000768, "grad_norm": 0.87109375, "learning_rate": 0.0016436160679996928, "loss": 5.6391, "step": 348000 }, { "epoch": 8.92239944699045, "grad_norm": 1.8046875, "learning_rate": 0.0016431040221203822, "loss": 5.6393, "step": 348500 }, { "epoch": 8.93520059397322, "grad_norm": 0.486328125, "learning_rate": 0.0016425919762410712, "loss": 5.6395, "step": 349000 }, { "epoch": 8.94800174095599, "grad_norm": 2.90625, "learning_rate": 0.0016420799303617604, "loss": 5.6394, "step": 349500 }, { "epoch": 8.96080288793876, "grad_norm": 0.88671875, "learning_rate": 0.0016415678844824498, "loss": 5.6396, "step": 350000 }, { "epoch": 8.973604034921529, "grad_norm": 2.15625, "learning_rate": 0.001641055838603139, "loss": 5.6387, "step": 350500 }, { "epoch": 8.9864051819043, "grad_norm": 1.578125, "learning_rate": 0.001640543792723828, "loss": 5.6394, "step": 351000 }, { "epoch": 8.999206328887068, "grad_norm": 1.0078125, "learning_rate": 0.0016400317468445173, "loss": 5.6387, "step": 351500 }, { "epoch": 9.0, "eval_loss": 5.638747215270996, "eval_runtime": 1.2906, "eval_samples_per_second": 774.854, "eval_steps_per_second": 3.099, "step": 351531 } ], "logging_steps": 500, "max_steps": 1952950, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2189688215083418e+20, "train_batch_size": 256, "trial_name": null, "trial_params": null }