{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9838509316770185, "eval_steps": 500, "global_step": 402, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007453416149068323, "grad_norm": 0.9149440636536349, "learning_rate": 2.3809523809523808e-06, "loss": 0.5267, "step": 1 }, { "epoch": 0.014906832298136646, "grad_norm": 0.813081054678093, "learning_rate": 4.7619047619047615e-06, "loss": 0.501, "step": 2 }, { "epoch": 0.02236024844720497, "grad_norm": 0.7030352003534567, "learning_rate": 7.142857142857143e-06, "loss": 0.4786, "step": 3 }, { "epoch": 0.02981366459627329, "grad_norm": 0.744974289676234, "learning_rate": 9.523809523809523e-06, "loss": 0.5096, "step": 4 }, { "epoch": 0.037267080745341616, "grad_norm": 0.5298082392046644, "learning_rate": 1.1904761904761905e-05, "loss": 0.4947, "step": 5 }, { "epoch": 0.04472049689440994, "grad_norm": 0.6276792078680746, "learning_rate": 1.4285714285714285e-05, "loss": 0.4889, "step": 6 }, { "epoch": 0.05217391304347826, "grad_norm": 0.9938211412587247, "learning_rate": 1.6666666666666667e-05, "loss": 0.4966, "step": 7 }, { "epoch": 0.05962732919254658, "grad_norm": 1.0688888961766796, "learning_rate": 1.9047619047619046e-05, "loss": 0.5423, "step": 8 }, { "epoch": 0.0670807453416149, "grad_norm": 0.7660208255317018, "learning_rate": 2.1428571428571428e-05, "loss": 0.5054, "step": 9 }, { "epoch": 0.07453416149068323, "grad_norm": 0.6232752556675165, "learning_rate": 2.380952380952381e-05, "loss": 0.516, "step": 10 }, { "epoch": 0.08198757763975155, "grad_norm": 0.5696191527866266, "learning_rate": 2.6190476190476192e-05, "loss": 0.4902, "step": 11 }, { "epoch": 0.08944099378881988, "grad_norm": 0.5574577130188229, "learning_rate": 2.857142857142857e-05, "loss": 0.5054, "step": 12 }, { "epoch": 0.0968944099378882, "grad_norm": 0.5992840639020023, "learning_rate": 3.095238095238095e-05, "loss": 0.5008, "step": 13 }, { "epoch": 0.10434782608695652, "grad_norm": 0.5954549641006874, "learning_rate": 3.3333333333333335e-05, "loss": 0.5023, "step": 14 }, { "epoch": 0.11180124223602485, "grad_norm": 0.576921385529578, "learning_rate": 3.571428571428572e-05, "loss": 0.5236, "step": 15 }, { "epoch": 0.11925465838509317, "grad_norm": 0.5706806482628634, "learning_rate": 3.809523809523809e-05, "loss": 0.5017, "step": 16 }, { "epoch": 0.1267080745341615, "grad_norm": 0.7086057812964052, "learning_rate": 4.047619047619048e-05, "loss": 0.5267, "step": 17 }, { "epoch": 0.1341614906832298, "grad_norm": 0.5808757241964403, "learning_rate": 4.2857142857142856e-05, "loss": 0.4904, "step": 18 }, { "epoch": 0.14161490683229813, "grad_norm": 0.6317506464382385, "learning_rate": 4.523809523809524e-05, "loss": 0.5043, "step": 19 }, { "epoch": 0.14906832298136646, "grad_norm": 0.6359839242618298, "learning_rate": 4.761904761904762e-05, "loss": 0.5055, "step": 20 }, { "epoch": 0.1565217391304348, "grad_norm": 0.5752065827486615, "learning_rate": 5e-05, "loss": 0.5084, "step": 21 }, { "epoch": 0.1639751552795031, "grad_norm": 0.574109201187312, "learning_rate": 4.999923510846293e-05, "loss": 0.4981, "step": 22 }, { "epoch": 0.17142857142857143, "grad_norm": 0.5996134853952727, "learning_rate": 4.999694048585699e-05, "loss": 0.4889, "step": 23 }, { "epoch": 0.17888198757763976, "grad_norm": 0.6523805926616552, "learning_rate": 4.999311628819437e-05, "loss": 0.4927, "step": 24 }, { "epoch": 0.18633540372670807, "grad_norm": 0.7101608078052835, "learning_rate": 4.9987762775483666e-05, "loss": 0.5078, "step": 25 }, { "epoch": 0.1937888198757764, "grad_norm": 0.7396307593577484, "learning_rate": 4.99808803117121e-05, "loss": 0.4943, "step": 26 }, { "epoch": 0.20124223602484473, "grad_norm": 0.653880275554884, "learning_rate": 4.9972469364820877e-05, "loss": 0.4981, "step": 27 }, { "epoch": 0.20869565217391303, "grad_norm": 0.9242693837414091, "learning_rate": 4.996253050667329e-05, "loss": 0.4875, "step": 28 }, { "epoch": 0.21614906832298136, "grad_norm": 1.2095187282407587, "learning_rate": 4.995106441301587e-05, "loss": 0.5071, "step": 29 }, { "epoch": 0.2236024844720497, "grad_norm": 0.7405451123886392, "learning_rate": 4.993807186343243e-05, "loss": 0.5168, "step": 30 }, { "epoch": 0.231055900621118, "grad_norm": 0.6770762633940498, "learning_rate": 4.992355374129109e-05, "loss": 0.4956, "step": 31 }, { "epoch": 0.23850931677018633, "grad_norm": 0.8790648152676741, "learning_rate": 4.990751103368418e-05, "loss": 0.4914, "step": 32 }, { "epoch": 0.24596273291925466, "grad_norm": 0.8761927581585033, "learning_rate": 4.988994483136115e-05, "loss": 0.5028, "step": 33 }, { "epoch": 0.253416149068323, "grad_norm": 1.0155828765501556, "learning_rate": 4.98708563286544e-05, "loss": 0.497, "step": 34 }, { "epoch": 0.2608695652173913, "grad_norm": 0.7164731439245322, "learning_rate": 4.985024682339807e-05, "loss": 0.4782, "step": 35 }, { "epoch": 0.2683229813664596, "grad_norm": 0.7789231779550446, "learning_rate": 4.982811771683982e-05, "loss": 0.5117, "step": 36 }, { "epoch": 0.27577639751552796, "grad_norm": 0.7333852855548396, "learning_rate": 4.980447051354555e-05, "loss": 0.5147, "step": 37 }, { "epoch": 0.28322981366459626, "grad_norm": 0.6356599304692622, "learning_rate": 4.977930682129711e-05, "loss": 0.4986, "step": 38 }, { "epoch": 0.2906832298136646, "grad_norm": 0.6005754618090398, "learning_rate": 4.975262835098295e-05, "loss": 0.4853, "step": 39 }, { "epoch": 0.2981366459627329, "grad_norm": 0.6269613620581922, "learning_rate": 4.9724436916481836e-05, "loss": 0.4842, "step": 40 }, { "epoch": 0.30559006211180123, "grad_norm": 0.4930016834035119, "learning_rate": 4.969473443453953e-05, "loss": 0.5001, "step": 41 }, { "epoch": 0.3130434782608696, "grad_norm": 0.6087046909213417, "learning_rate": 4.966352292463842e-05, "loss": 0.4903, "step": 42 }, { "epoch": 0.3204968944099379, "grad_norm": 0.6680768802723248, "learning_rate": 4.96308045088603e-05, "loss": 0.4864, "step": 43 }, { "epoch": 0.3279503105590062, "grad_norm": 0.5024028870492778, "learning_rate": 4.959658141174196e-05, "loss": 0.4928, "step": 44 }, { "epoch": 0.33540372670807456, "grad_norm": 0.5691045158483018, "learning_rate": 4.956085596012407e-05, "loss": 0.4895, "step": 45 }, { "epoch": 0.34285714285714286, "grad_norm": 0.5960753346949116, "learning_rate": 4.952363058299291e-05, "loss": 0.5037, "step": 46 }, { "epoch": 0.35031055900621116, "grad_norm": 0.5809783536316744, "learning_rate": 4.948490781131523e-05, "loss": 0.4955, "step": 47 }, { "epoch": 0.3577639751552795, "grad_norm": 0.4638121725312366, "learning_rate": 4.944469027786616e-05, "loss": 0.4828, "step": 48 }, { "epoch": 0.3652173913043478, "grad_norm": 0.6519884038596414, "learning_rate": 4.940298071705024e-05, "loss": 0.4869, "step": 49 }, { "epoch": 0.37267080745341613, "grad_norm": 0.725573010046233, "learning_rate": 4.935978196471548e-05, "loss": 0.4983, "step": 50 }, { "epoch": 0.3801242236024845, "grad_norm": 0.7723414020223717, "learning_rate": 4.931509695796055e-05, "loss": 0.5097, "step": 51 }, { "epoch": 0.3875776397515528, "grad_norm": 0.6749832378781871, "learning_rate": 4.926892873493509e-05, "loss": 0.5009, "step": 52 }, { "epoch": 0.3950310559006211, "grad_norm": 0.5287344971699504, "learning_rate": 4.922128043463316e-05, "loss": 0.5033, "step": 53 }, { "epoch": 0.40248447204968946, "grad_norm": 0.7590242284157045, "learning_rate": 4.917215529667979e-05, "loss": 0.5041, "step": 54 }, { "epoch": 0.40993788819875776, "grad_norm": 0.9634027388127688, "learning_rate": 4.9121556661110734e-05, "loss": 0.5084, "step": 55 }, { "epoch": 0.41739130434782606, "grad_norm": 0.8942287814733841, "learning_rate": 4.906948796814539e-05, "loss": 0.4982, "step": 56 }, { "epoch": 0.4248447204968944, "grad_norm": 0.7332394854708344, "learning_rate": 4.901595275795287e-05, "loss": 0.4893, "step": 57 }, { "epoch": 0.4322981366459627, "grad_norm": 0.6811910538390739, "learning_rate": 4.896095467041133e-05, "loss": 0.498, "step": 58 }, { "epoch": 0.43975155279503103, "grad_norm": 1.0815817510677823, "learning_rate": 4.8904497444860495e-05, "loss": 0.4951, "step": 59 }, { "epoch": 0.4472049689440994, "grad_norm": 0.5430186184806628, "learning_rate": 4.884658491984735e-05, "loss": 0.4974, "step": 60 }, { "epoch": 0.4546583850931677, "grad_norm": 0.7264258373815644, "learning_rate": 4.878722103286529e-05, "loss": 0.5017, "step": 61 }, { "epoch": 0.462111801242236, "grad_norm": 0.7414704944298521, "learning_rate": 4.8726409820086274e-05, "loss": 0.502, "step": 62 }, { "epoch": 0.46956521739130436, "grad_norm": 0.523437918394872, "learning_rate": 4.8664155416086495e-05, "loss": 0.4957, "step": 63 }, { "epoch": 0.47701863354037266, "grad_norm": 0.6159908506967338, "learning_rate": 4.8600462053565204e-05, "loss": 0.4956, "step": 64 }, { "epoch": 0.484472049689441, "grad_norm": 0.5860033924334783, "learning_rate": 4.8535334063056984e-05, "loss": 0.5036, "step": 65 }, { "epoch": 0.4919254658385093, "grad_norm": 0.510141066708458, "learning_rate": 4.846877587263728e-05, "loss": 0.4862, "step": 66 }, { "epoch": 0.4993788819875776, "grad_norm": 0.5214414519844613, "learning_rate": 4.840079200762135e-05, "loss": 0.4728, "step": 67 }, { "epoch": 0.506832298136646, "grad_norm": 0.5627288353125691, "learning_rate": 4.833138709025656e-05, "loss": 0.49, "step": 68 }, { "epoch": 0.5142857142857142, "grad_norm": 0.6512461745013858, "learning_rate": 4.826056583940815e-05, "loss": 0.4763, "step": 69 }, { "epoch": 0.5217391304347826, "grad_norm": 0.501857537261379, "learning_rate": 4.818833307023839e-05, "loss": 0.4801, "step": 70 }, { "epoch": 0.529192546583851, "grad_norm": 0.5401175474298149, "learning_rate": 4.811469369387917e-05, "loss": 0.5099, "step": 71 }, { "epoch": 0.5366459627329192, "grad_norm": 0.6056173499301607, "learning_rate": 4.803965271709811e-05, "loss": 0.4755, "step": 72 }, { "epoch": 0.5440993788819876, "grad_norm": 0.5300538292613007, "learning_rate": 4.7963215241958156e-05, "loss": 0.4914, "step": 73 }, { "epoch": 0.5515527950310559, "grad_norm": 0.49477384366828114, "learning_rate": 4.7885386465470675e-05, "loss": 0.4942, "step": 74 }, { "epoch": 0.5590062111801242, "grad_norm": 0.5832215591212782, "learning_rate": 4.780617167924209e-05, "loss": 0.4821, "step": 75 }, { "epoch": 0.5664596273291925, "grad_norm": 0.5825547831853878, "learning_rate": 4.772557626911417e-05, "loss": 0.4991, "step": 76 }, { "epoch": 0.5739130434782609, "grad_norm": 0.6260775830743696, "learning_rate": 4.7643605714797754e-05, "loss": 0.5178, "step": 77 }, { "epoch": 0.5813664596273292, "grad_norm": 0.5002047281833832, "learning_rate": 4.756026558950025e-05, "loss": 0.4788, "step": 78 }, { "epoch": 0.5888198757763975, "grad_norm": 0.658164441734046, "learning_rate": 4.747556155954668e-05, "loss": 0.4924, "step": 79 }, { "epoch": 0.5962732919254659, "grad_norm": 0.5642052228011414, "learning_rate": 4.738949938399445e-05, "loss": 0.4793, "step": 80 }, { "epoch": 0.6037267080745342, "grad_norm": 0.6800799125862594, "learning_rate": 4.730208491424174e-05, "loss": 0.4972, "step": 81 }, { "epoch": 0.6111801242236025, "grad_norm": 0.6022489226977471, "learning_rate": 4.721332409362973e-05, "loss": 0.4839, "step": 82 }, { "epoch": 0.6186335403726708, "grad_norm": 0.5548484325825994, "learning_rate": 4.712322295703846e-05, "loss": 0.4786, "step": 83 }, { "epoch": 0.6260869565217392, "grad_norm": 0.5717126912005164, "learning_rate": 4.7031787630476544e-05, "loss": 0.4938, "step": 84 }, { "epoch": 0.6335403726708074, "grad_norm": 0.5272588715397946, "learning_rate": 4.693902433066464e-05, "loss": 0.4793, "step": 85 }, { "epoch": 0.6409937888198758, "grad_norm": 0.5463413109564844, "learning_rate": 4.6844939364612796e-05, "loss": 0.4925, "step": 86 }, { "epoch": 0.6484472049689441, "grad_norm": 0.6075766167098049, "learning_rate": 4.674953912919161e-05, "loss": 0.5025, "step": 87 }, { "epoch": 0.6559006211180124, "grad_norm": 0.6310158980679005, "learning_rate": 4.665283011069733e-05, "loss": 0.4911, "step": 88 }, { "epoch": 0.6633540372670808, "grad_norm": 0.50469934840197, "learning_rate": 4.655481888441083e-05, "loss": 0.4764, "step": 89 }, { "epoch": 0.6708074534161491, "grad_norm": 0.488371832002747, "learning_rate": 4.6455512114150546e-05, "loss": 0.4937, "step": 90 }, { "epoch": 0.6782608695652174, "grad_norm": 0.4770981226634187, "learning_rate": 4.635491655181943e-05, "loss": 0.4884, "step": 91 }, { "epoch": 0.6857142857142857, "grad_norm": 0.5775420955722163, "learning_rate": 4.625303903694586e-05, "loss": 0.4853, "step": 92 }, { "epoch": 0.6931677018633541, "grad_norm": 0.47430309169555146, "learning_rate": 4.614988649621864e-05, "loss": 0.5139, "step": 93 }, { "epoch": 0.7006211180124223, "grad_norm": 0.6638176198018706, "learning_rate": 4.604546594301602e-05, "loss": 0.4953, "step": 94 }, { "epoch": 0.7080745341614907, "grad_norm": 0.5170361990821142, "learning_rate": 4.5939784476928894e-05, "loss": 0.4714, "step": 95 }, { "epoch": 0.715527950310559, "grad_norm": 0.5989923841003543, "learning_rate": 4.583284928327805e-05, "loss": 0.4826, "step": 96 }, { "epoch": 0.7229813664596273, "grad_norm": 0.5119408388808395, "learning_rate": 4.572466763262569e-05, "loss": 0.4876, "step": 97 }, { "epoch": 0.7304347826086957, "grad_norm": 0.4719741454775879, "learning_rate": 4.5615246880281066e-05, "loss": 0.4746, "step": 98 }, { "epoch": 0.737888198757764, "grad_norm": 0.5792116907988438, "learning_rate": 4.550459446580039e-05, "loss": 0.4778, "step": 99 }, { "epoch": 0.7453416149068323, "grad_norm": 0.5650326179874091, "learning_rate": 4.5392717912481025e-05, "loss": 0.5068, "step": 100 }, { "epoch": 0.7527950310559006, "grad_norm": 0.49349361289777244, "learning_rate": 4.527962482684998e-05, "loss": 0.4787, "step": 101 }, { "epoch": 0.760248447204969, "grad_norm": 0.590606894561532, "learning_rate": 4.516532289814674e-05, "loss": 0.4827, "step": 102 }, { "epoch": 0.7677018633540372, "grad_norm": 0.6325065592193169, "learning_rate": 4.5049819897800445e-05, "loss": 0.4822, "step": 103 }, { "epoch": 0.7751552795031056, "grad_norm": 0.48212618053402273, "learning_rate": 4.493312367890154e-05, "loss": 0.4862, "step": 104 }, { "epoch": 0.782608695652174, "grad_norm": 0.6659769851227831, "learning_rate": 4.481524217566783e-05, "loss": 0.4763, "step": 105 }, { "epoch": 0.7900621118012422, "grad_norm": 0.5100428848300065, "learning_rate": 4.4696183402905005e-05, "loss": 0.4861, "step": 106 }, { "epoch": 0.7975155279503106, "grad_norm": 0.4930498732601481, "learning_rate": 4.4575955455461764e-05, "loss": 0.4709, "step": 107 }, { "epoch": 0.8049689440993789, "grad_norm": 0.5557882232422525, "learning_rate": 4.4454566507679395e-05, "loss": 0.4823, "step": 108 }, { "epoch": 0.8124223602484472, "grad_norm": 0.4779459811311798, "learning_rate": 4.4332024812836026e-05, "loss": 0.4902, "step": 109 }, { "epoch": 0.8198757763975155, "grad_norm": 0.4951096303075202, "learning_rate": 4.420833870258544e-05, "loss": 0.4799, "step": 110 }, { "epoch": 0.8273291925465839, "grad_norm": 0.4972554983396596, "learning_rate": 4.4083516586390694e-05, "loss": 0.4757, "step": 111 }, { "epoch": 0.8347826086956521, "grad_norm": 0.6922157872179194, "learning_rate": 4.395756695095222e-05, "loss": 0.4871, "step": 112 }, { "epoch": 0.8422360248447205, "grad_norm": 0.614533156193526, "learning_rate": 4.383049835963095e-05, "loss": 0.4858, "step": 113 }, { "epoch": 0.8496894409937888, "grad_norm": 0.4998865414356172, "learning_rate": 4.370231945186601e-05, "loss": 0.5084, "step": 114 }, { "epoch": 0.8571428571428571, "grad_norm": 0.5297858251800804, "learning_rate": 4.357303894258733e-05, "loss": 0.4844, "step": 115 }, { "epoch": 0.8645962732919255, "grad_norm": 0.490173761162275, "learning_rate": 4.344266562162313e-05, "loss": 0.4775, "step": 116 }, { "epoch": 0.8720496894409938, "grad_norm": 0.5188647432432411, "learning_rate": 4.331120835310228e-05, "loss": 0.4731, "step": 117 }, { "epoch": 0.8795031055900621, "grad_norm": 0.5006594475547541, "learning_rate": 4.3178676074851646e-05, "loss": 0.4834, "step": 118 }, { "epoch": 0.8869565217391304, "grad_norm": 0.6966182509694935, "learning_rate": 4.3045077797788386e-05, "loss": 0.482, "step": 119 }, { "epoch": 0.8944099378881988, "grad_norm": 0.630693615513719, "learning_rate": 4.29104226053073e-05, "loss": 0.4806, "step": 120 }, { "epoch": 0.901863354037267, "grad_norm": 0.4855641171083717, "learning_rate": 4.277471965266325e-05, "loss": 0.4881, "step": 121 }, { "epoch": 0.9093167701863354, "grad_norm": 0.6046785410123138, "learning_rate": 4.2637978166348666e-05, "loss": 0.4985, "step": 122 }, { "epoch": 0.9167701863354037, "grad_norm": 0.6094452228967235, "learning_rate": 4.250020744346629e-05, "loss": 0.488, "step": 123 }, { "epoch": 0.924223602484472, "grad_norm": 0.5207025073876308, "learning_rate": 4.2361416851097e-05, "loss": 0.4839, "step": 124 }, { "epoch": 0.9316770186335404, "grad_norm": 0.5910466009416144, "learning_rate": 4.222161582566299e-05, "loss": 0.4816, "step": 125 }, { "epoch": 0.9391304347826087, "grad_norm": 0.6533302840950863, "learning_rate": 4.208081387228612e-05, "loss": 0.4848, "step": 126 }, { "epoch": 0.9465838509316771, "grad_norm": 0.5615426924165942, "learning_rate": 4.193902056414175e-05, "loss": 0.4952, "step": 127 }, { "epoch": 0.9540372670807453, "grad_norm": 0.5584154804823008, "learning_rate": 4.179624554180778e-05, "loss": 0.4998, "step": 128 }, { "epoch": 0.9614906832298137, "grad_norm": 0.6235842837428364, "learning_rate": 4.165249851260921e-05, "loss": 0.4918, "step": 129 }, { "epoch": 0.968944099378882, "grad_norm": 0.5109516402930713, "learning_rate": 4.1507789249958134e-05, "loss": 0.4812, "step": 130 }, { "epoch": 0.9763975155279503, "grad_norm": 0.5909881028917956, "learning_rate": 4.136212759268926e-05, "loss": 0.4713, "step": 131 }, { "epoch": 0.9838509316770186, "grad_norm": 0.8317014634966002, "learning_rate": 4.121552344439093e-05, "loss": 0.5024, "step": 132 }, { "epoch": 0.991304347826087, "grad_norm": 0.49067977677281605, "learning_rate": 4.1067986772731795e-05, "loss": 0.4816, "step": 133 }, { "epoch": 0.9987577639751553, "grad_norm": 0.6766338838361023, "learning_rate": 4.0919527608783105e-05, "loss": 0.477, "step": 134 }, { "epoch": 1.0, "grad_norm": 0.6766338838361023, "learning_rate": 4.077015604633669e-05, "loss": 0.4422, "step": 135 }, { "epoch": 1.0074534161490682, "grad_norm": 1.3875151975518343, "learning_rate": 4.0619882241218684e-05, "loss": 0.3755, "step": 136 }, { "epoch": 1.0149068322981367, "grad_norm": 0.7353931983255126, "learning_rate": 4.046871641059903e-05, "loss": 0.3713, "step": 137 }, { "epoch": 1.022360248447205, "grad_norm": 0.7602599418526752, "learning_rate": 4.031666883229678e-05, "loss": 0.3719, "step": 138 }, { "epoch": 1.0298136645962732, "grad_norm": 0.5465117063921796, "learning_rate": 4.016374984408137e-05, "loss": 0.3602, "step": 139 }, { "epoch": 1.0372670807453417, "grad_norm": 0.5515849784244337, "learning_rate": 4.000996984296967e-05, "loss": 0.3669, "step": 140 }, { "epoch": 1.04472049689441, "grad_norm": 0.547418960473341, "learning_rate": 3.985533928451914e-05, "loss": 0.3565, "step": 141 }, { "epoch": 1.0521739130434782, "grad_norm": 0.43709687343518044, "learning_rate": 3.969986868211693e-05, "loss": 0.3592, "step": 142 }, { "epoch": 1.0596273291925467, "grad_norm": 0.510533171129714, "learning_rate": 3.954356860626509e-05, "loss": 0.3548, "step": 143 }, { "epoch": 1.067080745341615, "grad_norm": 0.7014866699955652, "learning_rate": 3.938644968386188e-05, "loss": 0.3458, "step": 144 }, { "epoch": 1.0745341614906831, "grad_norm": 0.6054199711863072, "learning_rate": 3.922852259747921e-05, "loss": 0.3717, "step": 145 }, { "epoch": 1.0819875776397516, "grad_norm": 0.5885498856638683, "learning_rate": 3.9069798084636357e-05, "loss": 0.3563, "step": 146 }, { "epoch": 1.0894409937888199, "grad_norm": 0.6718059622184547, "learning_rate": 3.8910286937069894e-05, "loss": 0.3396, "step": 147 }, { "epoch": 1.0968944099378881, "grad_norm": 0.6261142587490304, "learning_rate": 3.875e-05, "loss": 0.3675, "step": 148 }, { "epoch": 1.1043478260869566, "grad_norm": 0.7988169272624631, "learning_rate": 3.858894817139304e-05, "loss": 0.3475, "step": 149 }, { "epoch": 1.1118012422360248, "grad_norm": 0.5931174308761531, "learning_rate": 3.8427142401220634e-05, "loss": 0.3475, "step": 150 }, { "epoch": 1.119254658385093, "grad_norm": 0.6442383924397598, "learning_rate": 3.8264593690715165e-05, "loss": 0.3497, "step": 151 }, { "epoch": 1.1267080745341616, "grad_norm": 0.5725732820991489, "learning_rate": 3.810131309162178e-05, "loss": 0.3559, "step": 152 }, { "epoch": 1.1341614906832298, "grad_norm": 0.5564507237104445, "learning_rate": 3.7937311705447016e-05, "loss": 0.3626, "step": 153 }, { "epoch": 1.141614906832298, "grad_norm": 0.6381547420777212, "learning_rate": 3.777260068270396e-05, "loss": 0.3405, "step": 154 }, { "epoch": 1.1490683229813665, "grad_norm": 0.5057416039471847, "learning_rate": 3.760719122215416e-05, "loss": 0.3524, "step": 155 }, { "epoch": 1.1565217391304348, "grad_norm": 0.5412142450588553, "learning_rate": 3.74410945700462e-05, "loss": 0.3513, "step": 156 }, { "epoch": 1.163975155279503, "grad_norm": 0.5764227357077137, "learning_rate": 3.727432201935107e-05, "loss": 0.349, "step": 157 }, { "epoch": 1.1714285714285715, "grad_norm": 0.43438798608153883, "learning_rate": 3.7106884908994314e-05, "loss": 0.3526, "step": 158 }, { "epoch": 1.1788819875776397, "grad_norm": 0.4757449865488938, "learning_rate": 3.693879462308516e-05, "loss": 0.3554, "step": 159 }, { "epoch": 1.186335403726708, "grad_norm": 0.5422686029816232, "learning_rate": 3.677006259014247e-05, "loss": 0.3449, "step": 160 }, { "epoch": 1.1937888198757765, "grad_norm": 0.4828269978461137, "learning_rate": 3.6600700282317704e-05, "loss": 0.3442, "step": 161 }, { "epoch": 1.2012422360248447, "grad_norm": 0.6064830533601335, "learning_rate": 3.643071921461497e-05, "loss": 0.3602, "step": 162 }, { "epoch": 1.208695652173913, "grad_norm": 0.37720957669313665, "learning_rate": 3.626013094410803e-05, "loss": 0.3632, "step": 163 }, { "epoch": 1.2161490683229814, "grad_norm": 0.5304204188881894, "learning_rate": 3.6088947069154624e-05, "loss": 0.3555, "step": 164 }, { "epoch": 1.2236024844720497, "grad_norm": 0.38241688724581824, "learning_rate": 3.591717922860785e-05, "loss": 0.3461, "step": 165 }, { "epoch": 1.231055900621118, "grad_norm": 0.4132634739723132, "learning_rate": 3.574483910102481e-05, "loss": 0.3473, "step": 166 }, { "epoch": 1.2385093167701864, "grad_norm": 0.41939353269128443, "learning_rate": 3.557193840387263e-05, "loss": 0.3592, "step": 167 }, { "epoch": 1.2459627329192546, "grad_norm": 0.4076888356556453, "learning_rate": 3.539848889273175e-05, "loss": 0.3503, "step": 168 }, { "epoch": 1.253416149068323, "grad_norm": 0.4270944790896845, "learning_rate": 3.522450236049668e-05, "loss": 0.3541, "step": 169 }, { "epoch": 1.2608695652173914, "grad_norm": 0.463757861659065, "learning_rate": 3.504999063657418e-05, "loss": 0.3542, "step": 170 }, { "epoch": 1.2683229813664596, "grad_norm": 0.3981674010803556, "learning_rate": 3.487496558607898e-05, "loss": 0.3487, "step": 171 }, { "epoch": 1.275776397515528, "grad_norm": 0.41590787708330634, "learning_rate": 3.4699439109027074e-05, "loss": 0.3498, "step": 172 }, { "epoch": 1.2832298136645963, "grad_norm": 0.44072247076963306, "learning_rate": 3.452342313952662e-05, "loss": 0.3525, "step": 173 }, { "epoch": 1.2906832298136646, "grad_norm": 0.38745116662865536, "learning_rate": 3.4346929644966564e-05, "loss": 0.3546, "step": 174 }, { "epoch": 1.298136645962733, "grad_norm": 0.40585109654853857, "learning_rate": 3.416997062520292e-05, "loss": 0.3508, "step": 175 }, { "epoch": 1.3055900621118013, "grad_norm": 0.4410011484738261, "learning_rate": 3.399255811174295e-05, "loss": 0.351, "step": 176 }, { "epoch": 1.3130434782608695, "grad_norm": 0.4196810442218097, "learning_rate": 3.38147041669271e-05, "loss": 0.3678, "step": 177 }, { "epoch": 1.320496894409938, "grad_norm": 0.3579720168253587, "learning_rate": 3.36364208831089e-05, "loss": 0.3585, "step": 178 }, { "epoch": 1.3279503105590063, "grad_norm": 0.428787604095252, "learning_rate": 3.345772038183281e-05, "loss": 0.3481, "step": 179 }, { "epoch": 1.3354037267080745, "grad_norm": 0.3731847145526466, "learning_rate": 3.3278614813010034e-05, "loss": 0.3529, "step": 180 }, { "epoch": 1.342857142857143, "grad_norm": 0.4426288735756756, "learning_rate": 3.309911635409246e-05, "loss": 0.3661, "step": 181 }, { "epoch": 1.3503105590062112, "grad_norm": 0.3937030484260621, "learning_rate": 3.291923720924473e-05, "loss": 0.3501, "step": 182 }, { "epoch": 1.3577639751552795, "grad_norm": 0.40353386330372387, "learning_rate": 3.273898960851443e-05, "loss": 0.3537, "step": 183 }, { "epoch": 1.365217391304348, "grad_norm": 0.3721891152141779, "learning_rate": 3.2558385807000654e-05, "loss": 0.3596, "step": 184 }, { "epoch": 1.3726708074534162, "grad_norm": 0.36845786723670104, "learning_rate": 3.2377438084020644e-05, "loss": 0.3532, "step": 185 }, { "epoch": 1.3801242236024844, "grad_norm": 0.43575242497786837, "learning_rate": 3.219615874227504e-05, "loss": 0.3581, "step": 186 }, { "epoch": 1.387577639751553, "grad_norm": 0.3972116108444874, "learning_rate": 3.2014560107011324e-05, "loss": 0.3544, "step": 187 }, { "epoch": 1.3950310559006212, "grad_norm": 0.3668661244872624, "learning_rate": 3.18326545251859e-05, "loss": 0.3551, "step": 188 }, { "epoch": 1.4024844720496894, "grad_norm": 0.35145343781990773, "learning_rate": 3.165045436462451e-05, "loss": 0.3502, "step": 189 }, { "epoch": 1.4099378881987579, "grad_norm": 0.3698602702933315, "learning_rate": 3.146797201318149e-05, "loss": 0.3473, "step": 190 }, { "epoch": 1.4173913043478261, "grad_norm": 0.40228974580805166, "learning_rate": 3.128521987789738e-05, "loss": 0.36, "step": 191 }, { "epoch": 1.4248447204968944, "grad_norm": 0.36657081099285116, "learning_rate": 3.110221038415545e-05, "loss": 0.3364, "step": 192 }, { "epoch": 1.4322981366459628, "grad_norm": 0.4544371717792479, "learning_rate": 3.0918955974836846e-05, "loss": 0.3521, "step": 193 }, { "epoch": 1.439751552795031, "grad_norm": 0.41609260719942065, "learning_rate": 3.073546910947461e-05, "loss": 0.3447, "step": 194 }, { "epoch": 1.4472049689440993, "grad_norm": 0.42225853754054415, "learning_rate": 3.0551762263406576e-05, "loss": 0.3579, "step": 195 }, { "epoch": 1.4546583850931678, "grad_norm": 0.4344358503994225, "learning_rate": 3.0367847926927134e-05, "loss": 0.3537, "step": 196 }, { "epoch": 1.462111801242236, "grad_norm": 0.4160571014028236, "learning_rate": 3.0183738604438006e-05, "loss": 0.3529, "step": 197 }, { "epoch": 1.4695652173913043, "grad_norm": 0.38345984224367013, "learning_rate": 2.999944681359811e-05, "loss": 0.3689, "step": 198 }, { "epoch": 1.4770186335403728, "grad_norm": 0.40710896458852786, "learning_rate": 2.9814985084472418e-05, "loss": 0.3596, "step": 199 }, { "epoch": 1.484472049689441, "grad_norm": 0.36671518555061516, "learning_rate": 2.9630365958680107e-05, "loss": 0.3622, "step": 200 }, { "epoch": 1.4919254658385093, "grad_norm": 0.4044241295664962, "learning_rate": 2.9445601988541782e-05, "loss": 0.3423, "step": 201 }, { "epoch": 1.4993788819875777, "grad_norm": 0.4084788856944912, "learning_rate": 2.9260705736226075e-05, "loss": 0.3523, "step": 202 }, { "epoch": 1.506832298136646, "grad_norm": 0.37362508421237023, "learning_rate": 2.9075689772895538e-05, "loss": 0.342, "step": 203 }, { "epoch": 1.5142857142857142, "grad_norm": 0.3760546446289586, "learning_rate": 2.88905666778519e-05, "loss": 0.3472, "step": 204 }, { "epoch": 1.5217391304347827, "grad_norm": 0.34422522904634617, "learning_rate": 2.870534903768082e-05, "loss": 0.3435, "step": 205 }, { "epoch": 1.529192546583851, "grad_norm": 0.36637523532571264, "learning_rate": 2.852004944539614e-05, "loss": 0.3576, "step": 206 }, { "epoch": 1.5366459627329192, "grad_norm": 0.4326264584972339, "learning_rate": 2.8334680499583617e-05, "loss": 0.3509, "step": 207 }, { "epoch": 1.5440993788819877, "grad_norm": 0.36854187428297824, "learning_rate": 2.814925480354441e-05, "loss": 0.3598, "step": 208 }, { "epoch": 1.551552795031056, "grad_norm": 0.43487065325981833, "learning_rate": 2.7963784964438122e-05, "loss": 0.3453, "step": 209 }, { "epoch": 1.5590062111801242, "grad_norm": 0.37541698341479846, "learning_rate": 2.777828359242567e-05, "loss": 0.3498, "step": 210 }, { "epoch": 1.5664596273291926, "grad_norm": 0.4124753900399903, "learning_rate": 2.759276329981191e-05, "loss": 0.3532, "step": 211 }, { "epoch": 1.5739130434782609, "grad_norm": 0.37293434521740576, "learning_rate": 2.74072367001881e-05, "loss": 0.3485, "step": 212 }, { "epoch": 1.5813664596273291, "grad_norm": 0.4288037014823596, "learning_rate": 2.722171640757434e-05, "loss": 0.3532, "step": 213 }, { "epoch": 1.5888198757763976, "grad_norm": 0.36312626249643093, "learning_rate": 2.703621503556189e-05, "loss": 0.3652, "step": 214 }, { "epoch": 1.5962732919254659, "grad_norm": 0.47318947905865477, "learning_rate": 2.6850745196455594e-05, "loss": 0.3546, "step": 215 }, { "epoch": 1.603726708074534, "grad_norm": 0.3525081596484065, "learning_rate": 2.6665319500416385e-05, "loss": 0.3503, "step": 216 }, { "epoch": 1.6111801242236026, "grad_norm": 0.45691496651133917, "learning_rate": 2.6479950554603862e-05, "loss": 0.3483, "step": 217 }, { "epoch": 1.6186335403726708, "grad_norm": 0.31443317345218114, "learning_rate": 2.6294650962319177e-05, "loss": 0.3639, "step": 218 }, { "epoch": 1.626086956521739, "grad_norm": 0.3939981518889869, "learning_rate": 2.6109433322148112e-05, "loss": 0.3591, "step": 219 }, { "epoch": 1.6335403726708075, "grad_norm": 0.34489071813729033, "learning_rate": 2.592431022710447e-05, "loss": 0.3524, "step": 220 }, { "epoch": 1.6409937888198758, "grad_norm": 0.35520259178429653, "learning_rate": 2.5739294263773933e-05, "loss": 0.3516, "step": 221 }, { "epoch": 1.648447204968944, "grad_norm": 0.3388748252085296, "learning_rate": 2.555439801145823e-05, "loss": 0.3586, "step": 222 }, { "epoch": 1.6559006211180125, "grad_norm": 0.34569372832787504, "learning_rate": 2.5369634041319895e-05, "loss": 0.353, "step": 223 }, { "epoch": 1.6633540372670808, "grad_norm": 0.3945538297720147, "learning_rate": 2.5185014915527587e-05, "loss": 0.3527, "step": 224 }, { "epoch": 1.670807453416149, "grad_norm": 0.3762612151126058, "learning_rate": 2.50005531864019e-05, "loss": 0.3489, "step": 225 }, { "epoch": 1.6782608695652175, "grad_norm": 0.44476348389377207, "learning_rate": 2.4816261395562003e-05, "loss": 0.3563, "step": 226 }, { "epoch": 1.6857142857142857, "grad_norm": 0.3646157307605566, "learning_rate": 2.4632152073072878e-05, "loss": 0.3532, "step": 227 }, { "epoch": 1.693167701863354, "grad_norm": 0.39063038331746, "learning_rate": 2.4448237736593422e-05, "loss": 0.3518, "step": 228 }, { "epoch": 1.7006211180124224, "grad_norm": 0.36030888961041857, "learning_rate": 2.4264530890525395e-05, "loss": 0.3514, "step": 229 }, { "epoch": 1.7080745341614907, "grad_norm": 0.34694110159996094, "learning_rate": 2.408104402516317e-05, "loss": 0.3537, "step": 230 }, { "epoch": 1.715527950310559, "grad_norm": 0.35170013263514505, "learning_rate": 2.3897789615844557e-05, "loss": 0.3556, "step": 231 }, { "epoch": 1.7229813664596274, "grad_norm": 0.339176023939733, "learning_rate": 2.3714780122102626e-05, "loss": 0.342, "step": 232 }, { "epoch": 1.7304347826086957, "grad_norm": 0.35440772273510307, "learning_rate": 2.3532027986818518e-05, "loss": 0.3431, "step": 233 }, { "epoch": 1.737888198757764, "grad_norm": 0.3624908891100353, "learning_rate": 2.3349545635375498e-05, "loss": 0.3549, "step": 234 }, { "epoch": 1.7453416149068324, "grad_norm": 0.3198798531907377, "learning_rate": 2.3167345474814118e-05, "loss": 0.358, "step": 235 }, { "epoch": 1.7527950310559006, "grad_norm": 0.33748379039276893, "learning_rate": 2.298543989298867e-05, "loss": 0.352, "step": 236 }, { "epoch": 1.7602484472049689, "grad_norm": 0.35787480655458953, "learning_rate": 2.2803841257724962e-05, "loss": 0.3517, "step": 237 }, { "epoch": 1.7677018633540373, "grad_norm": 0.3263237153649648, "learning_rate": 2.2622561915979358e-05, "loss": 0.3554, "step": 238 }, { "epoch": 1.7751552795031056, "grad_norm": 0.381086077893375, "learning_rate": 2.2441614192999354e-05, "loss": 0.3614, "step": 239 }, { "epoch": 1.7826086956521738, "grad_norm": 0.3615867584131201, "learning_rate": 2.226101039148557e-05, "loss": 0.3521, "step": 240 }, { "epoch": 1.7900621118012423, "grad_norm": 0.33249983999528465, "learning_rate": 2.2080762790755282e-05, "loss": 0.35, "step": 241 }, { "epoch": 1.7975155279503106, "grad_norm": 0.3466313876235434, "learning_rate": 2.1900883645907545e-05, "loss": 0.3478, "step": 242 }, { "epoch": 1.8049689440993788, "grad_norm": 0.3545791130868414, "learning_rate": 2.1721385186989978e-05, "loss": 0.3383, "step": 243 }, { "epoch": 1.8124223602484473, "grad_norm": 0.3433493456392416, "learning_rate": 2.1542279618167194e-05, "loss": 0.357, "step": 244 }, { "epoch": 1.8198757763975155, "grad_norm": 0.35215043297702653, "learning_rate": 2.13635791168911e-05, "loss": 0.3584, "step": 245 }, { "epoch": 1.8273291925465838, "grad_norm": 0.3306587366051955, "learning_rate": 2.1185295833072914e-05, "loss": 0.3536, "step": 246 }, { "epoch": 1.8347826086956522, "grad_norm": 0.37995802267618, "learning_rate": 2.1007441888257055e-05, "loss": 0.3555, "step": 247 }, { "epoch": 1.8422360248447205, "grad_norm": 0.3211766118761742, "learning_rate": 2.0830029374797085e-05, "loss": 0.3302, "step": 248 }, { "epoch": 1.8496894409937887, "grad_norm": 0.34496506016293466, "learning_rate": 2.0653070355033438e-05, "loss": 0.3331, "step": 249 }, { "epoch": 1.8571428571428572, "grad_norm": 0.35158994392589726, "learning_rate": 2.047657686047338e-05, "loss": 0.3522, "step": 250 }, { "epoch": 1.8645962732919255, "grad_norm": 0.2997777930152674, "learning_rate": 2.030056089097293e-05, "loss": 0.345, "step": 251 }, { "epoch": 1.8720496894409937, "grad_norm": 0.36933604277512955, "learning_rate": 2.0125034413921024e-05, "loss": 0.3553, "step": 252 }, { "epoch": 1.8795031055900622, "grad_norm": 0.30278033326279763, "learning_rate": 1.9950009363425827e-05, "loss": 0.3504, "step": 253 }, { "epoch": 1.8869565217391304, "grad_norm": 0.3317258855093456, "learning_rate": 1.9775497639503325e-05, "loss": 0.3637, "step": 254 }, { "epoch": 1.8944099378881987, "grad_norm": 0.32228882411814175, "learning_rate": 1.9601511107268255e-05, "loss": 0.3553, "step": 255 }, { "epoch": 1.9018633540372671, "grad_norm": 0.31378115419623065, "learning_rate": 1.9428061596127383e-05, "loss": 0.347, "step": 256 }, { "epoch": 1.9093167701863354, "grad_norm": 0.32150232133903023, "learning_rate": 1.9255160898975195e-05, "loss": 0.3416, "step": 257 }, { "epoch": 1.9167701863354036, "grad_norm": 0.30317071268510376, "learning_rate": 1.9082820771392157e-05, "loss": 0.3488, "step": 258 }, { "epoch": 1.924223602484472, "grad_norm": 0.3160493816316982, "learning_rate": 1.891105293084538e-05, "loss": 0.3524, "step": 259 }, { "epoch": 1.9316770186335404, "grad_norm": 0.31367068123245806, "learning_rate": 1.8739869055891972e-05, "loss": 0.3532, "step": 260 }, { "epoch": 1.9391304347826086, "grad_norm": 0.32732364352375504, "learning_rate": 1.8569280785385046e-05, "loss": 0.3437, "step": 261 }, { "epoch": 1.946583850931677, "grad_norm": 0.3044343110689431, "learning_rate": 1.8399299717682304e-05, "loss": 0.351, "step": 262 }, { "epoch": 1.9540372670807453, "grad_norm": 0.37461713163454335, "learning_rate": 1.822993740985754e-05, "loss": 0.3491, "step": 263 }, { "epoch": 1.9614906832298136, "grad_norm": 0.31668154420249334, "learning_rate": 1.806120537691485e-05, "loss": 0.3473, "step": 264 }, { "epoch": 1.968944099378882, "grad_norm": 0.3012343742566849, "learning_rate": 1.7893115091005695e-05, "loss": 0.3473, "step": 265 }, { "epoch": 1.9763975155279503, "grad_norm": 0.39737978489918535, "learning_rate": 1.772567798064894e-05, "loss": 0.3494, "step": 266 }, { "epoch": 1.9838509316770185, "grad_norm": 0.33805655625614117, "learning_rate": 1.7558905429953805e-05, "loss": 0.3475, "step": 267 }, { "epoch": 1.991304347826087, "grad_norm": 0.3322232086747, "learning_rate": 1.739280877784584e-05, "loss": 0.3308, "step": 268 }, { "epoch": 1.9987577639751553, "grad_norm": 0.38072556291830295, "learning_rate": 1.7227399317296043e-05, "loss": 0.3425, "step": 269 }, { "epoch": 2.0, "grad_norm": 0.38072556291830295, "learning_rate": 1.7062688294552992e-05, "loss": 0.3282, "step": 270 }, { "epoch": 2.0074534161490685, "grad_norm": 0.7460455385201656, "learning_rate": 1.6898686908378227e-05, "loss": 0.2385, "step": 271 }, { "epoch": 2.0149068322981365, "grad_norm": 0.4584394574079776, "learning_rate": 1.6735406309284847e-05, "loss": 0.2325, "step": 272 }, { "epoch": 2.022360248447205, "grad_norm": 0.35402810903371174, "learning_rate": 1.657285759877937e-05, "loss": 0.2289, "step": 273 }, { "epoch": 2.0298136645962734, "grad_norm": 0.41508856204183064, "learning_rate": 1.6411051828606964e-05, "loss": 0.2289, "step": 274 }, { "epoch": 2.0372670807453415, "grad_norm": 0.5703640626631827, "learning_rate": 1.6250000000000005e-05, "loss": 0.251, "step": 275 }, { "epoch": 2.04472049689441, "grad_norm": 0.44030030337444215, "learning_rate": 1.6089713062930108e-05, "loss": 0.2393, "step": 276 }, { "epoch": 2.0521739130434784, "grad_norm": 0.3634716004644788, "learning_rate": 1.5930201915363652e-05, "loss": 0.2208, "step": 277 }, { "epoch": 2.0596273291925464, "grad_norm": 0.40263713578890487, "learning_rate": 1.5771477402520797e-05, "loss": 0.2251, "step": 278 }, { "epoch": 2.067080745341615, "grad_norm": 0.38573706299136595, "learning_rate": 1.5613550316138116e-05, "loss": 0.2373, "step": 279 }, { "epoch": 2.0745341614906834, "grad_norm": 0.3886650442753063, "learning_rate": 1.5456431393734912e-05, "loss": 0.2328, "step": 280 }, { "epoch": 2.0819875776397514, "grad_norm": 0.3353566159721077, "learning_rate": 1.530013131788308e-05, "loss": 0.2368, "step": 281 }, { "epoch": 2.08944099378882, "grad_norm": 0.32235589864048775, "learning_rate": 1.5144660715480877e-05, "loss": 0.2372, "step": 282 }, { "epoch": 2.0968944099378883, "grad_norm": 0.3449572995224796, "learning_rate": 1.4990030157030343e-05, "loss": 0.2302, "step": 283 }, { "epoch": 2.1043478260869564, "grad_norm": 0.35370627629200513, "learning_rate": 1.4836250155918632e-05, "loss": 0.2254, "step": 284 }, { "epoch": 2.111801242236025, "grad_norm": 0.32255576283733056, "learning_rate": 1.4683331167703218e-05, "loss": 0.2304, "step": 285 }, { "epoch": 2.1192546583850933, "grad_norm": 0.33160244908908576, "learning_rate": 1.453128358940098e-05, "loss": 0.229, "step": 286 }, { "epoch": 2.1267080745341613, "grad_norm": 0.3342512523189852, "learning_rate": 1.4380117758781318e-05, "loss": 0.2232, "step": 287 }, { "epoch": 2.13416149068323, "grad_norm": 0.3154712145125721, "learning_rate": 1.4229843953663313e-05, "loss": 0.2295, "step": 288 }, { "epoch": 2.1416149068322983, "grad_norm": 0.2998522567104169, "learning_rate": 1.4080472391216898e-05, "loss": 0.2318, "step": 289 }, { "epoch": 2.1490683229813663, "grad_norm": 0.3120711414761453, "learning_rate": 1.3932013227268208e-05, "loss": 0.2199, "step": 290 }, { "epoch": 2.1565217391304348, "grad_norm": 0.31545849948494736, "learning_rate": 1.3784476555609077e-05, "loss": 0.2258, "step": 291 }, { "epoch": 2.1639751552795032, "grad_norm": 0.31288349011845973, "learning_rate": 1.3637872407310748e-05, "loss": 0.2372, "step": 292 }, { "epoch": 2.1714285714285713, "grad_norm": 0.30783876364368057, "learning_rate": 1.3492210750041873e-05, "loss": 0.2273, "step": 293 }, { "epoch": 2.1788819875776397, "grad_norm": 0.29777744790391, "learning_rate": 1.3347501487390801e-05, "loss": 0.2298, "step": 294 }, { "epoch": 2.186335403726708, "grad_norm": 0.30475333543232624, "learning_rate": 1.3203754458192225e-05, "loss": 0.234, "step": 295 }, { "epoch": 2.1937888198757762, "grad_norm": 0.29158802455098537, "learning_rate": 1.3060979435858259e-05, "loss": 0.2177, "step": 296 }, { "epoch": 2.2012422360248447, "grad_norm": 0.2910747993340585, "learning_rate": 1.2919186127713885e-05, "loss": 0.2257, "step": 297 }, { "epoch": 2.208695652173913, "grad_norm": 0.29580086796673544, "learning_rate": 1.2778384174337025e-05, "loss": 0.2256, "step": 298 }, { "epoch": 2.216149068322981, "grad_norm": 0.28921712308433817, "learning_rate": 1.2638583148903008e-05, "loss": 0.2321, "step": 299 }, { "epoch": 2.2236024844720497, "grad_norm": 0.28971064806152724, "learning_rate": 1.2499792556533716e-05, "loss": 0.2319, "step": 300 }, { "epoch": 2.231055900621118, "grad_norm": 0.2855654258873247, "learning_rate": 1.236202183365134e-05, "loss": 0.2255, "step": 301 }, { "epoch": 2.238509316770186, "grad_norm": 0.291778227302567, "learning_rate": 1.2225280347336763e-05, "loss": 0.2329, "step": 302 }, { "epoch": 2.2459627329192546, "grad_norm": 0.28470732682114813, "learning_rate": 1.20895773946927e-05, "loss": 0.2263, "step": 303 }, { "epoch": 2.253416149068323, "grad_norm": 0.27214743085323334, "learning_rate": 1.1954922202211615e-05, "loss": 0.2262, "step": 304 }, { "epoch": 2.260869565217391, "grad_norm": 0.2783027932885126, "learning_rate": 1.1821323925148358e-05, "loss": 0.2278, "step": 305 }, { "epoch": 2.2683229813664596, "grad_norm": 0.2940915504230925, "learning_rate": 1.1688791646897726e-05, "loss": 0.2254, "step": 306 }, { "epoch": 2.275776397515528, "grad_norm": 0.29241392964362756, "learning_rate": 1.1557334378376882e-05, "loss": 0.2274, "step": 307 }, { "epoch": 2.283229813664596, "grad_norm": 0.2859155808493438, "learning_rate": 1.1426961057412672e-05, "loss": 0.2242, "step": 308 }, { "epoch": 2.2906832298136646, "grad_norm": 0.2863824212664396, "learning_rate": 1.1297680548133993e-05, "loss": 0.2211, "step": 309 }, { "epoch": 2.298136645962733, "grad_norm": 0.27218685640719575, "learning_rate": 1.1169501640369051e-05, "loss": 0.2166, "step": 310 }, { "epoch": 2.305590062111801, "grad_norm": 0.2739565076313759, "learning_rate": 1.1042433049047781e-05, "loss": 0.2288, "step": 311 }, { "epoch": 2.3130434782608695, "grad_norm": 0.2858456970822226, "learning_rate": 1.0916483413609315e-05, "loss": 0.2252, "step": 312 }, { "epoch": 2.320496894409938, "grad_norm": 0.2730982695340446, "learning_rate": 1.079166129741455e-05, "loss": 0.2289, "step": 313 }, { "epoch": 2.327950310559006, "grad_norm": 0.2858131409653359, "learning_rate": 1.0667975187163976e-05, "loss": 0.2308, "step": 314 }, { "epoch": 2.3354037267080745, "grad_norm": 0.2666152133579397, "learning_rate": 1.0545433492320603e-05, "loss": 0.2278, "step": 315 }, { "epoch": 2.342857142857143, "grad_norm": 0.2770908273244185, "learning_rate": 1.042404454453824e-05, "loss": 0.2168, "step": 316 }, { "epoch": 2.350310559006211, "grad_norm": 0.2738838241569534, "learning_rate": 1.0303816597095004e-05, "loss": 0.2381, "step": 317 }, { "epoch": 2.3577639751552795, "grad_norm": 0.27892490339684844, "learning_rate": 1.0184757824332187e-05, "loss": 0.2365, "step": 318 }, { "epoch": 2.365217391304348, "grad_norm": 0.27327486293785713, "learning_rate": 1.0066876321098467e-05, "loss": 0.2262, "step": 319 }, { "epoch": 2.372670807453416, "grad_norm": 0.2691733028937572, "learning_rate": 9.950180102199564e-06, "loss": 0.2287, "step": 320 }, { "epoch": 2.3801242236024844, "grad_norm": 0.2706087500883035, "learning_rate": 9.834677101853265e-06, "loss": 0.2244, "step": 321 }, { "epoch": 2.387577639751553, "grad_norm": 0.2855306323449237, "learning_rate": 9.720375173150024e-06, "loss": 0.2349, "step": 322 }, { "epoch": 2.395031055900621, "grad_norm": 0.26777926603611185, "learning_rate": 9.607282087518984e-06, "loss": 0.2154, "step": 323 }, { "epoch": 2.4024844720496894, "grad_norm": 0.2821902505187217, "learning_rate": 9.495405534199617e-06, "loss": 0.2325, "step": 324 }, { "epoch": 2.409937888198758, "grad_norm": 0.27147857463244984, "learning_rate": 9.384753119718937e-06, "loss": 0.2228, "step": 325 }, { "epoch": 2.417391304347826, "grad_norm": 0.2699371891385757, "learning_rate": 9.27533236737431e-06, "loss": 0.2264, "step": 326 }, { "epoch": 2.4248447204968944, "grad_norm": 0.2762402133611527, "learning_rate": 9.167150716721954e-06, "loss": 0.2208, "step": 327 }, { "epoch": 2.432298136645963, "grad_norm": 0.27110209952512, "learning_rate": 9.060215523071117e-06, "loss": 0.2253, "step": 328 }, { "epoch": 2.439751552795031, "grad_norm": 0.27014197741120893, "learning_rate": 8.954534056983984e-06, "loss": 0.2293, "step": 329 }, { "epoch": 2.4472049689440993, "grad_norm": 0.27495397443241254, "learning_rate": 8.850113503781367e-06, "loss": 0.2209, "step": 330 }, { "epoch": 2.454658385093168, "grad_norm": 0.27468805910530714, "learning_rate": 8.746960963054145e-06, "loss": 0.2349, "step": 331 }, { "epoch": 2.462111801242236, "grad_norm": 0.2770381414499631, "learning_rate": 8.645083448180574e-06, "loss": 0.2238, "step": 332 }, { "epoch": 2.4695652173913043, "grad_norm": 0.27376387230241217, "learning_rate": 8.54448788584946e-06, "loss": 0.2207, "step": 333 }, { "epoch": 2.4770186335403728, "grad_norm": 0.27580353431299076, "learning_rate": 8.445181115589179e-06, "loss": 0.2219, "step": 334 }, { "epoch": 2.4844720496894412, "grad_norm": 0.26703692348696817, "learning_rate": 8.34716988930267e-06, "loss": 0.2187, "step": 335 }, { "epoch": 2.4919254658385093, "grad_norm": 0.2651902683216356, "learning_rate": 8.250460870808394e-06, "loss": 0.2238, "step": 336 }, { "epoch": 2.4993788819875777, "grad_norm": 0.2602841629540144, "learning_rate": 8.155060635387206e-06, "loss": 0.2237, "step": 337 }, { "epoch": 2.506832298136646, "grad_norm": 0.26591023651045076, "learning_rate": 8.060975669335365e-06, "loss": 0.2283, "step": 338 }, { "epoch": 2.5142857142857142, "grad_norm": 0.26252263621744065, "learning_rate": 7.968212369523462e-06, "loss": 0.2245, "step": 339 }, { "epoch": 2.5217391304347827, "grad_norm": 0.2748374618397896, "learning_rate": 7.876777042961544e-06, "loss": 0.2228, "step": 340 }, { "epoch": 2.529192546583851, "grad_norm": 0.2597274246070528, "learning_rate": 7.786675906370278e-06, "loss": 0.2283, "step": 341 }, { "epoch": 2.536645962732919, "grad_norm": 0.2687998792747167, "learning_rate": 7.697915085758266e-06, "loss": 0.2205, "step": 342 }, { "epoch": 2.5440993788819877, "grad_norm": 0.26277761372100056, "learning_rate": 7.610500616005556e-06, "loss": 0.2201, "step": 343 }, { "epoch": 2.551552795031056, "grad_norm": 0.273978880458127, "learning_rate": 7.524438440453323e-06, "loss": 0.2332, "step": 344 }, { "epoch": 2.559006211180124, "grad_norm": 0.25462514116789337, "learning_rate": 7.439734410499752e-06, "loss": 0.2188, "step": 345 }, { "epoch": 2.5664596273291926, "grad_norm": 0.271284156037868, "learning_rate": 7.356394285202248e-06, "loss": 0.2252, "step": 346 }, { "epoch": 2.573913043478261, "grad_norm": 0.2709719202368051, "learning_rate": 7.274423730885835e-06, "loss": 0.2377, "step": 347 }, { "epoch": 2.581366459627329, "grad_norm": 0.2687629580156147, "learning_rate": 7.193828320757909e-06, "loss": 0.2212, "step": 348 }, { "epoch": 2.5888198757763976, "grad_norm": 0.2723242179612203, "learning_rate": 7.114613534529333e-06, "loss": 0.2169, "step": 349 }, { "epoch": 2.596273291925466, "grad_norm": 0.25830008261692217, "learning_rate": 7.036784758041846e-06, "loss": 0.2151, "step": 350 }, { "epoch": 2.603726708074534, "grad_norm": 0.27058413732950065, "learning_rate": 6.960347282901894e-06, "loss": 0.226, "step": 351 }, { "epoch": 2.6111801242236026, "grad_norm": 0.2602830736394775, "learning_rate": 6.885306306120837e-06, "loss": 0.2173, "step": 352 }, { "epoch": 2.618633540372671, "grad_norm": 0.2884065006859397, "learning_rate": 6.811666929761612e-06, "loss": 0.223, "step": 353 }, { "epoch": 2.626086956521739, "grad_norm": 0.2678949940900833, "learning_rate": 6.739434160591852e-06, "loss": 0.2232, "step": 354 }, { "epoch": 2.6335403726708075, "grad_norm": 0.26613900004744706, "learning_rate": 6.668612909743448e-06, "loss": 0.2109, "step": 355 }, { "epoch": 2.640993788819876, "grad_norm": 0.2600134334991607, "learning_rate": 6.599207992378657e-06, "loss": 0.2216, "step": 356 }, { "epoch": 2.648447204968944, "grad_norm": 0.26265543365401706, "learning_rate": 6.531224127362726e-06, "loss": 0.223, "step": 357 }, { "epoch": 2.6559006211180125, "grad_norm": 0.26558869901544513, "learning_rate": 6.464665936943023e-06, "loss": 0.2172, "step": 358 }, { "epoch": 2.663354037267081, "grad_norm": 0.2693433022214806, "learning_rate": 6.399537946434801e-06, "loss": 0.2198, "step": 359 }, { "epoch": 2.670807453416149, "grad_norm": 0.2604150700405178, "learning_rate": 6.335844583913515e-06, "loss": 0.22, "step": 360 }, { "epoch": 2.6782608695652175, "grad_norm": 0.26795928209419323, "learning_rate": 6.27359017991373e-06, "loss": 0.2254, "step": 361 }, { "epoch": 2.685714285714286, "grad_norm": 0.26825259134506735, "learning_rate": 6.212778967134715e-06, "loss": 0.2286, "step": 362 }, { "epoch": 2.693167701863354, "grad_norm": 0.2624757714546698, "learning_rate": 6.153415080152655e-06, "loss": 0.2224, "step": 363 }, { "epoch": 2.7006211180124224, "grad_norm": 0.2538413882658132, "learning_rate": 6.095502555139516e-06, "loss": 0.2116, "step": 364 }, { "epoch": 2.708074534161491, "grad_norm": 0.26097610705351, "learning_rate": 6.039045329588671e-06, "loss": 0.2135, "step": 365 }, { "epoch": 2.715527950310559, "grad_norm": 0.26841385627861275, "learning_rate": 5.984047242047134e-06, "loss": 0.2126, "step": 366 }, { "epoch": 2.7229813664596274, "grad_norm": 0.26691120572518234, "learning_rate": 5.930512031854617e-06, "loss": 0.2231, "step": 367 }, { "epoch": 2.730434782608696, "grad_norm": 0.2595939680410351, "learning_rate": 5.8784433388892726e-06, "loss": 0.2187, "step": 368 }, { "epoch": 2.737888198757764, "grad_norm": 0.2620978338502642, "learning_rate": 5.827844703320216e-06, "loss": 0.2275, "step": 369 }, { "epoch": 2.7453416149068324, "grad_norm": 0.27293115847294985, "learning_rate": 5.778719565366846e-06, "loss": 0.2216, "step": 370 }, { "epoch": 2.752795031055901, "grad_norm": 0.26642579246201126, "learning_rate": 5.731071265064913e-06, "loss": 0.2253, "step": 371 }, { "epoch": 2.760248447204969, "grad_norm": 0.2673788170594958, "learning_rate": 5.684903042039452e-06, "loss": 0.2271, "step": 372 }, { "epoch": 2.7677018633540373, "grad_norm": 0.2554915331836976, "learning_rate": 5.640218035284521e-06, "loss": 0.2295, "step": 373 }, { "epoch": 2.775155279503106, "grad_norm": 0.2678452830442339, "learning_rate": 5.59701928294976e-06, "loss": 0.2215, "step": 374 }, { "epoch": 2.782608695652174, "grad_norm": 0.2815341168477551, "learning_rate": 5.555309722133842e-06, "loss": 0.2281, "step": 375 }, { "epoch": 2.7900621118012423, "grad_norm": 0.2624788732767209, "learning_rate": 5.515092188684775e-06, "loss": 0.2247, "step": 376 }, { "epoch": 2.7975155279503108, "grad_norm": 0.27489059149241923, "learning_rate": 5.476369417007091e-06, "loss": 0.2327, "step": 377 }, { "epoch": 2.804968944099379, "grad_norm": 0.28772902740092976, "learning_rate": 5.439144039875931e-06, "loss": 0.2167, "step": 378 }, { "epoch": 2.8124223602484473, "grad_norm": 0.26111982505416115, "learning_rate": 5.403418588258045e-06, "loss": 0.2197, "step": 379 }, { "epoch": 2.8198757763975157, "grad_norm": 0.267283694967123, "learning_rate": 5.369195491139709e-06, "loss": 0.2262, "step": 380 }, { "epoch": 2.8273291925465838, "grad_norm": 0.27325385624236986, "learning_rate": 5.336477075361577e-06, "loss": 0.2172, "step": 381 }, { "epoch": 2.8347826086956522, "grad_norm": 0.2647115488846027, "learning_rate": 5.305265565460477e-06, "loss": 0.2244, "step": 382 }, { "epoch": 2.8422360248447207, "grad_norm": 0.26432835544386557, "learning_rate": 5.275563083518169e-06, "loss": 0.216, "step": 383 }, { "epoch": 2.8496894409937887, "grad_norm": 0.2574806933503353, "learning_rate": 5.247371649017059e-06, "loss": 0.2309, "step": 384 }, { "epoch": 2.857142857142857, "grad_norm": 0.26132718645601316, "learning_rate": 5.220693178702895e-06, "loss": 0.2142, "step": 385 }, { "epoch": 2.8645962732919257, "grad_norm": 0.2778266622426319, "learning_rate": 5.195529486454448e-06, "loss": 0.2246, "step": 386 }, { "epoch": 2.8720496894409937, "grad_norm": 0.2548174424842807, "learning_rate": 5.171882283160185e-06, "loss": 0.2207, "step": 387 }, { "epoch": 2.879503105590062, "grad_norm": 0.26469737507644614, "learning_rate": 5.149753176601936e-06, "loss": 0.2245, "step": 388 }, { "epoch": 2.8869565217391306, "grad_norm": 0.2660467641695125, "learning_rate": 5.129143671345609e-06, "loss": 0.227, "step": 389 }, { "epoch": 2.8944099378881987, "grad_norm": 0.253693975774761, "learning_rate": 5.110055168638854e-06, "loss": 0.219, "step": 390 }, { "epoch": 2.901863354037267, "grad_norm": 0.2795604845969123, "learning_rate": 5.09248896631582e-06, "loss": 0.2321, "step": 391 }, { "epoch": 2.9093167701863356, "grad_norm": 0.7496235586126184, "learning_rate": 5.07644625870891e-06, "loss": 0.2366, "step": 392 }, { "epoch": 2.9167701863354036, "grad_norm": 0.2618794070766183, "learning_rate": 5.06192813656757e-06, "loss": 0.2283, "step": 393 }, { "epoch": 2.924223602484472, "grad_norm": 0.2616116501289447, "learning_rate": 5.048935586984133e-06, "loss": 0.2177, "step": 394 }, { "epoch": 2.9316770186335406, "grad_norm": 0.2719537845754615, "learning_rate": 5.0374694933267114e-06, "loss": 0.2273, "step": 395 }, { "epoch": 2.9391304347826086, "grad_norm": 0.26226288303787226, "learning_rate": 5.027530635179121e-06, "loss": 0.2132, "step": 396 }, { "epoch": 2.946583850931677, "grad_norm": 0.24929011431595574, "learning_rate": 5.019119688287901e-06, "loss": 0.2238, "step": 397 }, { "epoch": 2.9540372670807455, "grad_norm": 0.2606373590411471, "learning_rate": 5.012237224516342e-06, "loss": 0.2216, "step": 398 }, { "epoch": 2.9614906832298136, "grad_norm": 0.29892861469452736, "learning_rate": 5.00688371180563e-06, "loss": 0.2362, "step": 399 }, { "epoch": 2.968944099378882, "grad_norm": 0.2665719018831708, "learning_rate": 5.003059514143014e-06, "loss": 0.221, "step": 400 }, { "epoch": 2.9763975155279505, "grad_norm": 0.25860725999612966, "learning_rate": 5.000764891537067e-06, "loss": 0.2204, "step": 401 }, { "epoch": 2.9838509316770185, "grad_norm": 0.25161372246359515, "learning_rate": 5e-06, "loss": 0.2245, "step": 402 }, { "epoch": 2.9838509316770185, "step": 402, "total_flos": 8.574354709534474e+17, "train_loss": 0.3579157315083404, "train_runtime": 11555.4492, "train_samples_per_second": 6.688, "train_steps_per_second": 0.035 } ], "logging_steps": 1, "max_steps": 402, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.574354709534474e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }