{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9991154356479433, "eval_steps": 142, "global_step": 1130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.680586814880371, "learning_rate": 1e-05, "loss": 3.3182, "step": 1 }, { "epoch": 0.0, "eval_loss": 3.3362529277801514, "eval_runtime": 14.4477, "eval_samples_per_second": 33.016, "eval_steps_per_second": 8.306, "step": 1 }, { "epoch": 0.0, "grad_norm": 4.609802722930908, "learning_rate": 2e-05, "loss": 3.2788, "step": 2 }, { "epoch": 0.01, "grad_norm": 4.793943405151367, "learning_rate": 3e-05, "loss": 3.3432, "step": 3 }, { "epoch": 0.01, "grad_norm": 4.687256336212158, "learning_rate": 4e-05, "loss": 3.2521, "step": 4 }, { "epoch": 0.01, "grad_norm": 4.676945209503174, "learning_rate": 5e-05, "loss": 3.1085, "step": 5 }, { "epoch": 0.01, "grad_norm": 4.490086078643799, "learning_rate": 6e-05, "loss": 2.8093, "step": 6 }, { "epoch": 0.01, "grad_norm": 4.042544364929199, "learning_rate": 7e-05, "loss": 2.3501, "step": 7 }, { "epoch": 0.01, "grad_norm": 3.4973549842834473, "learning_rate": 8e-05, "loss": 1.6118, "step": 8 }, { "epoch": 0.02, "grad_norm": 2.9255049228668213, "learning_rate": 9e-05, "loss": 0.9938, "step": 9 }, { "epoch": 0.02, "grad_norm": 1.9458708763122559, "learning_rate": 0.0001, "loss": 0.4821, "step": 10 }, { "epoch": 0.02, "grad_norm": 1.3317277431488037, "learning_rate": 9.999991309598974e-05, "loss": 0.3336, "step": 11 }, { "epoch": 0.02, "grad_norm": 0.7918155193328857, "learning_rate": 9.999965238426104e-05, "loss": 0.1707, "step": 12 }, { "epoch": 0.02, "grad_norm": 0.779201090335846, "learning_rate": 9.999921786572015e-05, "loss": 0.1089, "step": 13 }, { "epoch": 0.02, "grad_norm": 1.8333582878112793, "learning_rate": 9.999860954187756e-05, "loss": 0.1829, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.7508969902992249, "learning_rate": 9.999782741484788e-05, "loss": 0.1284, "step": 15 }, { "epoch": 0.03, "grad_norm": 0.6024438142776489, "learning_rate": 9.999687148734995e-05, "loss": 0.1321, "step": 16 }, { "epoch": 0.03, "grad_norm": 0.2141278237104416, "learning_rate": 9.999574176270667e-05, "loss": 0.1294, "step": 17 }, { "epoch": 0.03, "grad_norm": 0.8326414227485657, "learning_rate": 9.999443824484519e-05, "loss": 0.1414, "step": 18 }, { "epoch": 0.03, "grad_norm": 0.25254812836647034, "learning_rate": 9.999296093829672e-05, "loss": 0.1389, "step": 19 }, { "epoch": 0.04, "grad_norm": 0.24321849644184113, "learning_rate": 9.999130984819662e-05, "loss": 0.1354, "step": 20 }, { "epoch": 0.04, "grad_norm": 0.27592459321022034, "learning_rate": 9.998948498028435e-05, "loss": 0.1032, "step": 21 }, { "epoch": 0.04, "grad_norm": 0.5619893670082092, "learning_rate": 9.998748634090344e-05, "loss": 0.1264, "step": 22 }, { "epoch": 0.04, "grad_norm": 0.7668437361717224, "learning_rate": 9.998531393700148e-05, "loss": 0.1223, "step": 23 }, { "epoch": 0.04, "grad_norm": 0.38381776213645935, "learning_rate": 9.99829677761301e-05, "loss": 0.0988, "step": 24 }, { "epoch": 0.04, "grad_norm": 1.8528074026107788, "learning_rate": 9.998044786644491e-05, "loss": 0.1421, "step": 25 }, { "epoch": 0.05, "grad_norm": 2.61940336227417, "learning_rate": 9.997775421670556e-05, "loss": 0.2738, "step": 26 }, { "epoch": 0.05, "grad_norm": 0.5642948150634766, "learning_rate": 9.997488683627559e-05, "loss": 0.1113, "step": 27 }, { "epoch": 0.05, "grad_norm": 0.31713128089904785, "learning_rate": 9.997184573512245e-05, "loss": 0.0593, "step": 28 }, { "epoch": 0.05, "grad_norm": 1.540770411491394, "learning_rate": 9.996863092381752e-05, "loss": 0.2008, "step": 29 }, { "epoch": 0.05, "grad_norm": 0.5343081951141357, "learning_rate": 9.9965242413536e-05, "loss": 0.1141, "step": 30 }, { "epoch": 0.05, "grad_norm": 0.3453178107738495, "learning_rate": 9.99616802160569e-05, "loss": 0.1137, "step": 31 }, { "epoch": 0.06, "grad_norm": 0.24307988584041595, "learning_rate": 9.995794434376297e-05, "loss": 0.0971, "step": 32 }, { "epoch": 0.06, "grad_norm": 0.1228247806429863, "learning_rate": 9.995403480964072e-05, "loss": 0.1246, "step": 33 }, { "epoch": 0.06, "grad_norm": 0.28919702768325806, "learning_rate": 9.994995162728029e-05, "loss": 0.1485, "step": 34 }, { "epoch": 0.06, "grad_norm": 0.16266010701656342, "learning_rate": 9.994569481087552e-05, "loss": 0.1196, "step": 35 }, { "epoch": 0.06, "grad_norm": 0.13942277431488037, "learning_rate": 9.994126437522375e-05, "loss": 0.1266, "step": 36 }, { "epoch": 0.07, "grad_norm": 0.22137008607387543, "learning_rate": 9.99366603357259e-05, "loss": 0.1226, "step": 37 }, { "epoch": 0.07, "grad_norm": 0.4418635070323944, "learning_rate": 9.993188270838635e-05, "loss": 0.1577, "step": 38 }, { "epoch": 0.07, "grad_norm": 0.1568412482738495, "learning_rate": 9.992693150981292e-05, "loss": 0.1205, "step": 39 }, { "epoch": 0.07, "grad_norm": 0.33117881417274475, "learning_rate": 9.992180675721672e-05, "loss": 0.1179, "step": 40 }, { "epoch": 0.07, "grad_norm": 0.18135391175746918, "learning_rate": 9.991650846841226e-05, "loss": 0.098, "step": 41 }, { "epoch": 0.07, "grad_norm": 0.23688584566116333, "learning_rate": 9.99110366618172e-05, "loss": 0.0839, "step": 42 }, { "epoch": 0.08, "grad_norm": 0.40056225657463074, "learning_rate": 9.990539135645245e-05, "loss": 0.1648, "step": 43 }, { "epoch": 0.08, "grad_norm": 0.16901437938213348, "learning_rate": 9.9899572571942e-05, "loss": 0.052, "step": 44 }, { "epoch": 0.08, "grad_norm": 0.6529514193534851, "learning_rate": 9.989358032851284e-05, "loss": 0.1448, "step": 45 }, { "epoch": 0.08, "grad_norm": 0.25050362944602966, "learning_rate": 9.9887414646995e-05, "loss": 0.083, "step": 46 }, { "epoch": 0.08, "grad_norm": 0.38230955600738525, "learning_rate": 9.988107554882138e-05, "loss": 0.0912, "step": 47 }, { "epoch": 0.08, "grad_norm": 0.24738825857639313, "learning_rate": 9.987456305602769e-05, "loss": 0.1337, "step": 48 }, { "epoch": 0.09, "grad_norm": 0.22692906856536865, "learning_rate": 9.986787719125241e-05, "loss": 0.0924, "step": 49 }, { "epoch": 0.09, "grad_norm": 0.6348592638969421, "learning_rate": 9.986101797773667e-05, "loss": 0.1401, "step": 50 }, { "epoch": 0.09, "grad_norm": 0.27844250202178955, "learning_rate": 9.985398543932421e-05, "loss": 0.1028, "step": 51 }, { "epoch": 0.09, "grad_norm": 0.4353552460670471, "learning_rate": 9.984677960046123e-05, "loss": 0.1245, "step": 52 }, { "epoch": 0.09, "grad_norm": 0.3049747943878174, "learning_rate": 9.98394004861964e-05, "loss": 0.0936, "step": 53 }, { "epoch": 0.1, "grad_norm": 0.34531188011169434, "learning_rate": 9.983184812218072e-05, "loss": 0.0775, "step": 54 }, { "epoch": 0.1, "grad_norm": 0.252056360244751, "learning_rate": 9.98241225346674e-05, "loss": 0.1082, "step": 55 }, { "epoch": 0.1, "grad_norm": 0.14545601606369019, "learning_rate": 9.981622375051183e-05, "loss": 0.0857, "step": 56 }, { "epoch": 0.1, "grad_norm": 0.313376784324646, "learning_rate": 9.980815179717145e-05, "loss": 0.0798, "step": 57 }, { "epoch": 0.1, "grad_norm": 0.3164367079734802, "learning_rate": 9.979990670270564e-05, "loss": 0.1103, "step": 58 }, { "epoch": 0.1, "grad_norm": 0.30412447452545166, "learning_rate": 9.979148849577572e-05, "loss": 0.0889, "step": 59 }, { "epoch": 0.11, "grad_norm": 0.2582318186759949, "learning_rate": 9.978289720564471e-05, "loss": 0.0844, "step": 60 }, { "epoch": 0.11, "grad_norm": 0.2912735939025879, "learning_rate": 9.977413286217728e-05, "loss": 0.077, "step": 61 }, { "epoch": 0.11, "grad_norm": 0.3888057768344879, "learning_rate": 9.976519549583974e-05, "loss": 0.1386, "step": 62 }, { "epoch": 0.11, "grad_norm": 0.42121654748916626, "learning_rate": 9.975608513769976e-05, "loss": 0.0823, "step": 63 }, { "epoch": 0.11, "grad_norm": 0.4052259624004364, "learning_rate": 9.974680181942645e-05, "loss": 0.0846, "step": 64 }, { "epoch": 0.11, "grad_norm": 0.2273682802915573, "learning_rate": 9.973734557329009e-05, "loss": 0.0589, "step": 65 }, { "epoch": 0.12, "grad_norm": 0.5051669478416443, "learning_rate": 9.972771643216212e-05, "loss": 0.1111, "step": 66 }, { "epoch": 0.12, "grad_norm": 0.2672370970249176, "learning_rate": 9.971791442951497e-05, "loss": 0.0819, "step": 67 }, { "epoch": 0.12, "grad_norm": 0.28557881712913513, "learning_rate": 9.970793959942198e-05, "loss": 0.0912, "step": 68 }, { "epoch": 0.12, "grad_norm": 0.34148702025413513, "learning_rate": 9.969779197655726e-05, "loss": 0.1036, "step": 69 }, { "epoch": 0.12, "grad_norm": 0.3550778925418854, "learning_rate": 9.968747159619556e-05, "loss": 0.0833, "step": 70 }, { "epoch": 0.13, "grad_norm": 0.3434258699417114, "learning_rate": 9.967697849421221e-05, "loss": 0.1186, "step": 71 }, { "epoch": 0.13, "grad_norm": 0.23545867204666138, "learning_rate": 9.966631270708287e-05, "loss": 0.1185, "step": 72 }, { "epoch": 0.13, "grad_norm": 0.28094542026519775, "learning_rate": 9.965547427188357e-05, "loss": 0.052, "step": 73 }, { "epoch": 0.13, "grad_norm": 0.13517600297927856, "learning_rate": 9.964446322629043e-05, "loss": 0.0695, "step": 74 }, { "epoch": 0.13, "grad_norm": 0.16696467995643616, "learning_rate": 9.963327960857962e-05, "loss": 0.1003, "step": 75 }, { "epoch": 0.13, "grad_norm": 0.18569788336753845, "learning_rate": 9.962192345762717e-05, "loss": 0.0495, "step": 76 }, { "epoch": 0.14, "grad_norm": 0.19817449152469635, "learning_rate": 9.961039481290888e-05, "loss": 0.067, "step": 77 }, { "epoch": 0.14, "grad_norm": 0.14672966301441193, "learning_rate": 9.959869371450021e-05, "loss": 0.0737, "step": 78 }, { "epoch": 0.14, "grad_norm": 0.302121639251709, "learning_rate": 9.958682020307601e-05, "loss": 0.0779, "step": 79 }, { "epoch": 0.14, "grad_norm": 0.35970303416252136, "learning_rate": 9.957477431991054e-05, "loss": 0.134, "step": 80 }, { "epoch": 0.14, "grad_norm": 0.308292955160141, "learning_rate": 9.956255610687719e-05, "loss": 0.1006, "step": 81 }, { "epoch": 0.15, "grad_norm": 0.27124735713005066, "learning_rate": 9.955016560644847e-05, "loss": 0.0572, "step": 82 }, { "epoch": 0.15, "grad_norm": 0.1820615977048874, "learning_rate": 9.953760286169571e-05, "loss": 0.0595, "step": 83 }, { "epoch": 0.15, "grad_norm": 0.36385104060173035, "learning_rate": 9.952486791628905e-05, "loss": 0.0874, "step": 84 }, { "epoch": 0.15, "grad_norm": 0.848340630531311, "learning_rate": 9.95119608144972e-05, "loss": 0.1178, "step": 85 }, { "epoch": 0.15, "grad_norm": 0.7947489023208618, "learning_rate": 9.94988816011873e-05, "loss": 0.1115, "step": 86 }, { "epoch": 0.15, "grad_norm": 0.33932074904441833, "learning_rate": 9.94856303218248e-05, "loss": 0.0546, "step": 87 }, { "epoch": 0.16, "grad_norm": 0.26873940229415894, "learning_rate": 9.947220702247329e-05, "loss": 0.0873, "step": 88 }, { "epoch": 0.16, "grad_norm": 0.3373044431209564, "learning_rate": 9.945861174979429e-05, "loss": 0.1051, "step": 89 }, { "epoch": 0.16, "grad_norm": 0.24391719698905945, "learning_rate": 9.944484455104717e-05, "loss": 0.0986, "step": 90 }, { "epoch": 0.16, "grad_norm": 0.4301680326461792, "learning_rate": 9.943090547408888e-05, "loss": 0.1524, "step": 91 }, { "epoch": 0.16, "grad_norm": 0.4246341288089752, "learning_rate": 9.941679456737394e-05, "loss": 0.1619, "step": 92 }, { "epoch": 0.16, "grad_norm": 0.2680893838405609, "learning_rate": 9.940251187995411e-05, "loss": 0.1187, "step": 93 }, { "epoch": 0.17, "grad_norm": 0.18920297920703888, "learning_rate": 9.938805746147827e-05, "loss": 0.105, "step": 94 }, { "epoch": 0.17, "grad_norm": 0.22168701887130737, "learning_rate": 9.937343136219233e-05, "loss": 0.0856, "step": 95 }, { "epoch": 0.17, "grad_norm": 0.2235199213027954, "learning_rate": 9.935863363293896e-05, "loss": 0.1026, "step": 96 }, { "epoch": 0.17, "grad_norm": 0.16578496992588043, "learning_rate": 9.93436643251574e-05, "loss": 0.0777, "step": 97 }, { "epoch": 0.17, "grad_norm": 0.15994016826152802, "learning_rate": 9.932852349088342e-05, "loss": 0.0957, "step": 98 }, { "epoch": 0.18, "grad_norm": 0.18692170083522797, "learning_rate": 9.931321118274897e-05, "loss": 0.0913, "step": 99 }, { "epoch": 0.18, "grad_norm": 0.15477485954761505, "learning_rate": 9.929772745398206e-05, "loss": 0.0911, "step": 100 }, { "epoch": 0.18, "grad_norm": 0.28473320603370667, "learning_rate": 9.928207235840664e-05, "loss": 0.1283, "step": 101 }, { "epoch": 0.18, "grad_norm": 0.38557159900665283, "learning_rate": 9.926624595044234e-05, "loss": 0.1125, "step": 102 }, { "epoch": 0.18, "grad_norm": 0.13523289561271667, "learning_rate": 9.925024828510427e-05, "loss": 0.0555, "step": 103 }, { "epoch": 0.18, "grad_norm": 0.15305563807487488, "learning_rate": 9.923407941800291e-05, "loss": 0.1003, "step": 104 }, { "epoch": 0.19, "grad_norm": 0.21130621433258057, "learning_rate": 9.921773940534382e-05, "loss": 0.0945, "step": 105 }, { "epoch": 0.19, "grad_norm": 0.2301904559135437, "learning_rate": 9.920122830392748e-05, "loss": 0.1019, "step": 106 }, { "epoch": 0.19, "grad_norm": 0.16425654292106628, "learning_rate": 9.918454617114918e-05, "loss": 0.0781, "step": 107 }, { "epoch": 0.19, "grad_norm": 0.2672991454601288, "learning_rate": 9.916769306499866e-05, "loss": 0.085, "step": 108 }, { "epoch": 0.19, "grad_norm": 0.3746218681335449, "learning_rate": 9.915066904406e-05, "loss": 0.1698, "step": 109 }, { "epoch": 0.19, "grad_norm": 0.1691233068704605, "learning_rate": 9.913347416751148e-05, "loss": 0.046, "step": 110 }, { "epoch": 0.2, "grad_norm": 0.3089153468608856, "learning_rate": 9.91161084951252e-05, "loss": 0.131, "step": 111 }, { "epoch": 0.2, "grad_norm": 0.1581045240163803, "learning_rate": 9.909857208726705e-05, "loss": 0.0654, "step": 112 }, { "epoch": 0.2, "grad_norm": 0.2545772194862366, "learning_rate": 9.908086500489637e-05, "loss": 0.1021, "step": 113 }, { "epoch": 0.2, "grad_norm": 0.2257249355316162, "learning_rate": 9.906298730956586e-05, "loss": 0.0636, "step": 114 }, { "epoch": 0.2, "grad_norm": 0.17862719297409058, "learning_rate": 9.904493906342123e-05, "loss": 0.0942, "step": 115 }, { "epoch": 0.21, "grad_norm": 0.23423053324222565, "learning_rate": 9.902672032920106e-05, "loss": 0.0676, "step": 116 }, { "epoch": 0.21, "grad_norm": 0.26653358340263367, "learning_rate": 9.900833117023664e-05, "loss": 0.0918, "step": 117 }, { "epoch": 0.21, "grad_norm": 0.4517073631286621, "learning_rate": 9.89897716504516e-05, "loss": 0.1102, "step": 118 }, { "epoch": 0.21, "grad_norm": 0.20187437534332275, "learning_rate": 9.897104183436183e-05, "loss": 0.0713, "step": 119 }, { "epoch": 0.21, "grad_norm": 0.5759711861610413, "learning_rate": 9.895214178707516e-05, "loss": 0.0837, "step": 120 }, { "epoch": 0.21, "grad_norm": 0.43704915046691895, "learning_rate": 9.89330715742912e-05, "loss": 0.0868, "step": 121 }, { "epoch": 0.22, "grad_norm": 0.30784374475479126, "learning_rate": 9.891383126230104e-05, "loss": 0.1171, "step": 122 }, { "epoch": 0.22, "grad_norm": 0.23538921773433685, "learning_rate": 9.889442091798712e-05, "loss": 0.055, "step": 123 }, { "epoch": 0.22, "grad_norm": 0.27727362513542175, "learning_rate": 9.887484060882291e-05, "loss": 0.041, "step": 124 }, { "epoch": 0.22, "grad_norm": 0.21666617691516876, "learning_rate": 9.885509040287268e-05, "loss": 0.0624, "step": 125 }, { "epoch": 0.22, "grad_norm": 0.2829636335372925, "learning_rate": 9.883517036879132e-05, "loss": 0.0946, "step": 126 }, { "epoch": 0.22, "grad_norm": 0.34035512804985046, "learning_rate": 9.88150805758241e-05, "loss": 0.0635, "step": 127 }, { "epoch": 0.23, "grad_norm": 0.44064444303512573, "learning_rate": 9.879482109380634e-05, "loss": 0.0931, "step": 128 }, { "epoch": 0.23, "grad_norm": 0.5164741277694702, "learning_rate": 9.877439199316323e-05, "loss": 0.0891, "step": 129 }, { "epoch": 0.23, "grad_norm": 0.5549228191375732, "learning_rate": 9.875379334490962e-05, "loss": 0.1144, "step": 130 }, { "epoch": 0.23, "grad_norm": 0.41133901476860046, "learning_rate": 9.873302522064972e-05, "loss": 0.1022, "step": 131 }, { "epoch": 0.23, "grad_norm": 0.5204330682754517, "learning_rate": 9.871208769257685e-05, "loss": 0.0867, "step": 132 }, { "epoch": 0.24, "grad_norm": 0.3383274972438812, "learning_rate": 9.869098083347323e-05, "loss": 0.0558, "step": 133 }, { "epoch": 0.24, "grad_norm": 0.7084139585494995, "learning_rate": 9.866970471670967e-05, "loss": 0.1208, "step": 134 }, { "epoch": 0.24, "grad_norm": 0.2974587380886078, "learning_rate": 9.864825941624537e-05, "loss": 0.1199, "step": 135 }, { "epoch": 0.24, "grad_norm": 0.1811504065990448, "learning_rate": 9.862664500662764e-05, "loss": 0.1025, "step": 136 }, { "epoch": 0.24, "grad_norm": 0.23748300969600677, "learning_rate": 9.860486156299164e-05, "loss": 0.0864, "step": 137 }, { "epoch": 0.24, "grad_norm": 0.21784676611423492, "learning_rate": 9.85829091610601e-05, "loss": 0.095, "step": 138 }, { "epoch": 0.25, "grad_norm": 0.26979225873947144, "learning_rate": 9.856078787714309e-05, "loss": 0.0864, "step": 139 }, { "epoch": 0.25, "grad_norm": 0.1479984074831009, "learning_rate": 9.853849778813777e-05, "loss": 0.0904, "step": 140 }, { "epoch": 0.25, "grad_norm": 0.17924343049526215, "learning_rate": 9.851603897152803e-05, "loss": 0.0752, "step": 141 }, { "epoch": 0.25, "grad_norm": 0.16448016464710236, "learning_rate": 9.849341150538434e-05, "loss": 0.0781, "step": 142 }, { "epoch": 0.25, "eval_loss": 0.0848281979560852, "eval_runtime": 14.6961, "eval_samples_per_second": 32.458, "eval_steps_per_second": 8.165, "step": 142 }, { "epoch": 0.25, "grad_norm": 0.14405055344104767, "learning_rate": 9.847061546836339e-05, "loss": 0.1007, "step": 143 }, { "epoch": 0.25, "grad_norm": 0.20907168090343475, "learning_rate": 9.844765093970787e-05, "loss": 0.1126, "step": 144 }, { "epoch": 0.26, "grad_norm": 0.1777975857257843, "learning_rate": 9.842451799924616e-05, "loss": 0.0928, "step": 145 }, { "epoch": 0.26, "grad_norm": 0.1817995309829712, "learning_rate": 9.840121672739208e-05, "loss": 0.046, "step": 146 }, { "epoch": 0.26, "grad_norm": 0.2099136859178543, "learning_rate": 9.837774720514457e-05, "loss": 0.1032, "step": 147 }, { "epoch": 0.26, "grad_norm": 0.19467169046401978, "learning_rate": 9.835410951408748e-05, "loss": 0.0913, "step": 148 }, { "epoch": 0.26, "grad_norm": 0.24700500071048737, "learning_rate": 9.833030373638919e-05, "loss": 0.1101, "step": 149 }, { "epoch": 0.27, "grad_norm": 0.22854459285736084, "learning_rate": 9.830632995480242e-05, "loss": 0.0729, "step": 150 }, { "epoch": 0.27, "grad_norm": 0.206742063164711, "learning_rate": 9.828218825266388e-05, "loss": 0.0861, "step": 151 }, { "epoch": 0.27, "grad_norm": 0.40378740429878235, "learning_rate": 9.8257878713894e-05, "loss": 0.0948, "step": 152 }, { "epoch": 0.27, "grad_norm": 0.5055291652679443, "learning_rate": 9.823340142299662e-05, "loss": 0.193, "step": 153 }, { "epoch": 0.27, "grad_norm": 0.3036790192127228, "learning_rate": 9.820875646505874e-05, "loss": 0.0859, "step": 154 }, { "epoch": 0.27, "grad_norm": 0.1878231018781662, "learning_rate": 9.818394392575019e-05, "loss": 0.0702, "step": 155 }, { "epoch": 0.28, "grad_norm": 0.17990007996559143, "learning_rate": 9.815896389132333e-05, "loss": 0.0967, "step": 156 }, { "epoch": 0.28, "grad_norm": 0.22680750489234924, "learning_rate": 9.813381644861277e-05, "loss": 0.0959, "step": 157 }, { "epoch": 0.28, "grad_norm": 0.299663782119751, "learning_rate": 9.810850168503506e-05, "loss": 0.0801, "step": 158 }, { "epoch": 0.28, "grad_norm": 0.3132835924625397, "learning_rate": 9.808301968858837e-05, "loss": 0.1151, "step": 159 }, { "epoch": 0.28, "grad_norm": 0.16891297698020935, "learning_rate": 9.805737054785222e-05, "loss": 0.0799, "step": 160 }, { "epoch": 0.28, "grad_norm": 0.23542983829975128, "learning_rate": 9.803155435198712e-05, "loss": 0.0645, "step": 161 }, { "epoch": 0.29, "grad_norm": 0.1784803569316864, "learning_rate": 9.800557119073433e-05, "loss": 0.0475, "step": 162 }, { "epoch": 0.29, "grad_norm": 0.37001606822013855, "learning_rate": 9.797942115441545e-05, "loss": 0.1331, "step": 163 }, { "epoch": 0.29, "grad_norm": 0.18926851451396942, "learning_rate": 9.795310433393226e-05, "loss": 0.0744, "step": 164 }, { "epoch": 0.29, "grad_norm": 0.15572589635849, "learning_rate": 9.792662082076618e-05, "loss": 0.0551, "step": 165 }, { "epoch": 0.29, "grad_norm": 0.2562514841556549, "learning_rate": 9.789997070697821e-05, "loss": 0.106, "step": 166 }, { "epoch": 0.3, "grad_norm": 0.4756919741630554, "learning_rate": 9.787315408520838e-05, "loss": 0.1229, "step": 167 }, { "epoch": 0.3, "grad_norm": 0.2322833091020584, "learning_rate": 9.78461710486756e-05, "loss": 0.1212, "step": 168 }, { "epoch": 0.3, "grad_norm": 0.28180065751075745, "learning_rate": 9.78190216911772e-05, "loss": 0.0855, "step": 169 }, { "epoch": 0.3, "grad_norm": 0.2058788239955902, "learning_rate": 9.779170610708872e-05, "loss": 0.0445, "step": 170 }, { "epoch": 0.3, "grad_norm": 0.16885802149772644, "learning_rate": 9.776422439136352e-05, "loss": 0.0619, "step": 171 }, { "epoch": 0.3, "grad_norm": 0.2562006413936615, "learning_rate": 9.773657663953243e-05, "loss": 0.1111, "step": 172 }, { "epoch": 0.31, "grad_norm": 0.2394249439239502, "learning_rate": 9.770876294770349e-05, "loss": 0.0562, "step": 173 }, { "epoch": 0.31, "grad_norm": 0.214800164103508, "learning_rate": 9.768078341256155e-05, "loss": 0.0428, "step": 174 }, { "epoch": 0.31, "grad_norm": 0.30056923627853394, "learning_rate": 9.765263813136796e-05, "loss": 0.1173, "step": 175 }, { "epoch": 0.31, "grad_norm": 0.3174525499343872, "learning_rate": 9.762432720196024e-05, "loss": 0.0871, "step": 176 }, { "epoch": 0.31, "grad_norm": 0.31560906767845154, "learning_rate": 9.75958507227517e-05, "loss": 0.1133, "step": 177 }, { "epoch": 0.31, "grad_norm": 0.21752357482910156, "learning_rate": 9.756720879273117e-05, "loss": 0.0421, "step": 178 }, { "epoch": 0.32, "grad_norm": 0.3215218484401703, "learning_rate": 9.753840151146259e-05, "loss": 0.0596, "step": 179 }, { "epoch": 0.32, "grad_norm": 0.3161137104034424, "learning_rate": 9.750942897908468e-05, "loss": 0.122, "step": 180 }, { "epoch": 0.32, "grad_norm": 0.4206744134426117, "learning_rate": 9.748029129631062e-05, "loss": 0.0966, "step": 181 }, { "epoch": 0.32, "grad_norm": 0.28242579102516174, "learning_rate": 9.745098856442768e-05, "loss": 0.0853, "step": 182 }, { "epoch": 0.32, "grad_norm": 0.24647079408168793, "learning_rate": 9.742152088529684e-05, "loss": 0.1077, "step": 183 }, { "epoch": 0.33, "grad_norm": 0.29940977692604065, "learning_rate": 9.739188836135247e-05, "loss": 0.0837, "step": 184 }, { "epoch": 0.33, "grad_norm": 0.21811984479427338, "learning_rate": 9.7362091095602e-05, "loss": 0.1, "step": 185 }, { "epoch": 0.33, "grad_norm": 0.24434742331504822, "learning_rate": 9.733212919162549e-05, "loss": 0.0839, "step": 186 }, { "epoch": 0.33, "grad_norm": 0.1656690537929535, "learning_rate": 9.730200275357535e-05, "loss": 0.0894, "step": 187 }, { "epoch": 0.33, "grad_norm": 0.16984042525291443, "learning_rate": 9.727171188617587e-05, "loss": 0.0732, "step": 188 }, { "epoch": 0.33, "grad_norm": 0.19889003038406372, "learning_rate": 9.7241256694723e-05, "loss": 0.0832, "step": 189 }, { "epoch": 0.34, "grad_norm": 0.1639273464679718, "learning_rate": 9.721063728508383e-05, "loss": 0.0912, "step": 190 }, { "epoch": 0.34, "grad_norm": 0.26211172342300415, "learning_rate": 9.717985376369639e-05, "loss": 0.0986, "step": 191 }, { "epoch": 0.34, "grad_norm": 0.12403538823127747, "learning_rate": 9.714890623756912e-05, "loss": 0.0844, "step": 192 }, { "epoch": 0.34, "grad_norm": 0.1882586032152176, "learning_rate": 9.711779481428057e-05, "loss": 0.1163, "step": 193 }, { "epoch": 0.34, "grad_norm": 0.1780715435743332, "learning_rate": 9.708651960197904e-05, "loss": 0.1038, "step": 194 }, { "epoch": 0.34, "grad_norm": 0.1291002333164215, "learning_rate": 9.705508070938218e-05, "loss": 0.0746, "step": 195 }, { "epoch": 0.35, "grad_norm": 0.2384466975927353, "learning_rate": 9.702347824577666e-05, "loss": 0.0909, "step": 196 }, { "epoch": 0.35, "grad_norm": 0.25463247299194336, "learning_rate": 9.699171232101768e-05, "loss": 0.0977, "step": 197 }, { "epoch": 0.35, "grad_norm": 0.19303986430168152, "learning_rate": 9.69597830455287e-05, "loss": 0.1137, "step": 198 }, { "epoch": 0.35, "grad_norm": 0.21899022161960602, "learning_rate": 9.692769053030099e-05, "loss": 0.0671, "step": 199 }, { "epoch": 0.35, "grad_norm": 0.16923530399799347, "learning_rate": 9.689543488689332e-05, "loss": 0.0776, "step": 200 }, { "epoch": 0.36, "grad_norm": 0.260955810546875, "learning_rate": 9.686301622743144e-05, "loss": 0.092, "step": 201 }, { "epoch": 0.36, "grad_norm": 0.1771455854177475, "learning_rate": 9.683043466460782e-05, "loss": 0.06, "step": 202 }, { "epoch": 0.36, "grad_norm": 0.18851810693740845, "learning_rate": 9.67976903116812e-05, "loss": 0.0965, "step": 203 }, { "epoch": 0.36, "grad_norm": 0.589522659778595, "learning_rate": 9.676478328247622e-05, "loss": 0.1673, "step": 204 }, { "epoch": 0.36, "grad_norm": 0.21747058629989624, "learning_rate": 9.673171369138296e-05, "loss": 0.0997, "step": 205 }, { "epoch": 0.36, "grad_norm": 0.4168107807636261, "learning_rate": 9.669848165335666e-05, "loss": 0.0795, "step": 206 }, { "epoch": 0.37, "grad_norm": 0.6520416140556335, "learning_rate": 9.666508728391719e-05, "loss": 0.1177, "step": 207 }, { "epoch": 0.37, "grad_norm": 0.3752453029155731, "learning_rate": 9.663153069914875e-05, "loss": 0.0871, "step": 208 }, { "epoch": 0.37, "grad_norm": 0.26546868681907654, "learning_rate": 9.65978120156994e-05, "loss": 0.0647, "step": 209 }, { "epoch": 0.37, "grad_norm": 0.20044176280498505, "learning_rate": 9.656393135078068e-05, "loss": 0.1072, "step": 210 }, { "epoch": 0.37, "grad_norm": 0.25033503770828247, "learning_rate": 9.652988882216724e-05, "loss": 0.1326, "step": 211 }, { "epoch": 0.38, "grad_norm": 0.24569682776927948, "learning_rate": 9.649568454819637e-05, "loss": 0.0931, "step": 212 }, { "epoch": 0.38, "grad_norm": 0.5623157620429993, "learning_rate": 9.64613186477676e-05, "loss": 0.2157, "step": 213 }, { "epoch": 0.38, "grad_norm": 0.15052182972431183, "learning_rate": 9.642679124034233e-05, "loss": 0.1236, "step": 214 }, { "epoch": 0.38, "grad_norm": 0.14209671318531036, "learning_rate": 9.639210244594334e-05, "loss": 0.0971, "step": 215 }, { "epoch": 0.38, "grad_norm": 0.1627768725156784, "learning_rate": 9.635725238515445e-05, "loss": 0.1161, "step": 216 }, { "epoch": 0.38, "grad_norm": 0.10190293937921524, "learning_rate": 9.63222411791201e-05, "loss": 0.0999, "step": 217 }, { "epoch": 0.39, "grad_norm": 0.11575043201446533, "learning_rate": 9.62870689495448e-05, "loss": 0.0986, "step": 218 }, { "epoch": 0.39, "grad_norm": 0.09842410683631897, "learning_rate": 9.62517358186929e-05, "loss": 0.1176, "step": 219 }, { "epoch": 0.39, "grad_norm": 0.14816004037857056, "learning_rate": 9.621624190938803e-05, "loss": 0.0833, "step": 220 }, { "epoch": 0.39, "grad_norm": 0.11311839520931244, "learning_rate": 9.618058734501269e-05, "loss": 0.0815, "step": 221 }, { "epoch": 0.39, "grad_norm": 0.17481163144111633, "learning_rate": 9.614477224950789e-05, "loss": 0.0678, "step": 222 }, { "epoch": 0.39, "grad_norm": 0.2277013659477234, "learning_rate": 9.610879674737264e-05, "loss": 0.0941, "step": 223 }, { "epoch": 0.4, "grad_norm": 0.14689870178699493, "learning_rate": 9.607266096366352e-05, "loss": 0.0991, "step": 224 }, { "epoch": 0.4, "grad_norm": 0.24558769166469574, "learning_rate": 9.603636502399436e-05, "loss": 0.0878, "step": 225 }, { "epoch": 0.4, "grad_norm": 0.1541660875082016, "learning_rate": 9.599990905453567e-05, "loss": 0.0784, "step": 226 }, { "epoch": 0.4, "grad_norm": 0.12188339233398438, "learning_rate": 9.59632931820142e-05, "loss": 0.0464, "step": 227 }, { "epoch": 0.4, "grad_norm": 0.32710394263267517, "learning_rate": 9.592651753371265e-05, "loss": 0.0541, "step": 228 }, { "epoch": 0.41, "grad_norm": 0.3118465840816498, "learning_rate": 9.588958223746903e-05, "loss": 0.0845, "step": 229 }, { "epoch": 0.41, "grad_norm": 0.26805219054222107, "learning_rate": 9.585248742167639e-05, "loss": 0.0485, "step": 230 }, { "epoch": 0.41, "grad_norm": 0.7972936630249023, "learning_rate": 9.581523321528223e-05, "loss": 0.1013, "step": 231 }, { "epoch": 0.41, "grad_norm": 0.6285438537597656, "learning_rate": 9.577781974778817e-05, "loss": 0.0767, "step": 232 }, { "epoch": 0.41, "grad_norm": 0.6384493708610535, "learning_rate": 9.57402471492494e-05, "loss": 0.1855, "step": 233 }, { "epoch": 0.41, "grad_norm": 0.5759001970291138, "learning_rate": 9.570251555027432e-05, "loss": 0.1585, "step": 234 }, { "epoch": 0.42, "grad_norm": 0.42002353072166443, "learning_rate": 9.566462508202402e-05, "loss": 0.1479, "step": 235 }, { "epoch": 0.42, "grad_norm": 0.18405884504318237, "learning_rate": 9.562657587621184e-05, "loss": 0.09, "step": 236 }, { "epoch": 0.42, "grad_norm": 0.20893922448158264, "learning_rate": 9.558836806510291e-05, "loss": 0.0685, "step": 237 }, { "epoch": 0.42, "grad_norm": 0.31388092041015625, "learning_rate": 9.555000178151374e-05, "loss": 0.0983, "step": 238 }, { "epoch": 0.42, "grad_norm": 0.20344533026218414, "learning_rate": 9.551147715881166e-05, "loss": 0.0944, "step": 239 }, { "epoch": 0.42, "grad_norm": 0.1582648903131485, "learning_rate": 9.547279433091446e-05, "loss": 0.0662, "step": 240 }, { "epoch": 0.43, "grad_norm": 0.16737405955791473, "learning_rate": 9.543395343228983e-05, "loss": 0.1565, "step": 241 }, { "epoch": 0.43, "grad_norm": 0.21974924206733704, "learning_rate": 9.539495459795499e-05, "loss": 0.1243, "step": 242 }, { "epoch": 0.43, "grad_norm": 0.1147058829665184, "learning_rate": 9.535579796347612e-05, "loss": 0.0727, "step": 243 }, { "epoch": 0.43, "grad_norm": 0.13460345566272736, "learning_rate": 9.531648366496799e-05, "loss": 0.0691, "step": 244 }, { "epoch": 0.43, "grad_norm": 0.1404263824224472, "learning_rate": 9.527701183909336e-05, "loss": 0.0975, "step": 245 }, { "epoch": 0.44, "grad_norm": 0.17380090057849884, "learning_rate": 9.523738262306269e-05, "loss": 0.0873, "step": 246 }, { "epoch": 0.44, "grad_norm": 0.13862797617912292, "learning_rate": 9.519759615463346e-05, "loss": 0.0738, "step": 247 }, { "epoch": 0.44, "grad_norm": 0.17551685869693756, "learning_rate": 9.51576525721098e-05, "loss": 0.0676, "step": 248 }, { "epoch": 0.44, "grad_norm": 0.20715269446372986, "learning_rate": 9.511755201434205e-05, "loss": 0.0737, "step": 249 }, { "epoch": 0.44, "grad_norm": 0.14763356745243073, "learning_rate": 9.507729462072614e-05, "loss": 0.07, "step": 250 }, { "epoch": 0.44, "grad_norm": 0.317452073097229, "learning_rate": 9.503688053120327e-05, "loss": 0.1252, "step": 251 }, { "epoch": 0.45, "grad_norm": 0.21908459067344666, "learning_rate": 9.499630988625925e-05, "loss": 0.0877, "step": 252 }, { "epoch": 0.45, "grad_norm": 0.3233601450920105, "learning_rate": 9.49555828269242e-05, "loss": 0.0891, "step": 253 }, { "epoch": 0.45, "grad_norm": 0.4098372161388397, "learning_rate": 9.491469949477187e-05, "loss": 0.0805, "step": 254 }, { "epoch": 0.45, "grad_norm": 0.40573808550834656, "learning_rate": 9.487366003191931e-05, "loss": 0.1284, "step": 255 }, { "epoch": 0.45, "grad_norm": 0.3391616940498352, "learning_rate": 9.483246458102625e-05, "loss": 0.0901, "step": 256 }, { "epoch": 0.45, "grad_norm": 0.1822938323020935, "learning_rate": 9.479111328529473e-05, "loss": 0.0398, "step": 257 }, { "epoch": 0.46, "grad_norm": 0.4700302183628082, "learning_rate": 9.474960628846843e-05, "loss": 0.1509, "step": 258 }, { "epoch": 0.46, "grad_norm": 0.20210890471935272, "learning_rate": 9.470794373483236e-05, "loss": 0.0765, "step": 259 }, { "epoch": 0.46, "grad_norm": 0.28329914808273315, "learning_rate": 9.466612576921223e-05, "loss": 0.0666, "step": 260 }, { "epoch": 0.46, "grad_norm": 0.41083166003227234, "learning_rate": 9.462415253697401e-05, "loss": 0.1248, "step": 261 }, { "epoch": 0.46, "grad_norm": 0.17644570767879486, "learning_rate": 9.458202418402338e-05, "loss": 0.0532, "step": 262 }, { "epoch": 0.47, "grad_norm": 0.2667219936847687, "learning_rate": 9.453974085680526e-05, "loss": 0.0937, "step": 263 }, { "epoch": 0.47, "grad_norm": 0.20900332927703857, "learning_rate": 9.449730270230326e-05, "loss": 0.0853, "step": 264 }, { "epoch": 0.47, "grad_norm": 0.26425743103027344, "learning_rate": 9.445470986803922e-05, "loss": 0.12, "step": 265 }, { "epoch": 0.47, "grad_norm": 0.1956167221069336, "learning_rate": 9.441196250207267e-05, "loss": 0.0965, "step": 266 }, { "epoch": 0.47, "grad_norm": 0.21896903216838837, "learning_rate": 9.436906075300032e-05, "loss": 0.0867, "step": 267 }, { "epoch": 0.47, "grad_norm": 0.2082919031381607, "learning_rate": 9.432600476995551e-05, "loss": 0.0847, "step": 268 }, { "epoch": 0.48, "grad_norm": 0.1674569696187973, "learning_rate": 9.428279470260776e-05, "loss": 0.0846, "step": 269 }, { "epoch": 0.48, "grad_norm": 0.23109744489192963, "learning_rate": 9.423943070116218e-05, "loss": 0.136, "step": 270 }, { "epoch": 0.48, "grad_norm": 0.21344415843486786, "learning_rate": 9.4195912916359e-05, "loss": 0.1091, "step": 271 }, { "epoch": 0.48, "grad_norm": 0.16391590237617493, "learning_rate": 9.415224149947306e-05, "loss": 0.0901, "step": 272 }, { "epoch": 0.48, "grad_norm": 0.2023243010044098, "learning_rate": 9.410841660231315e-05, "loss": 0.0635, "step": 273 }, { "epoch": 0.48, "grad_norm": 0.1723608821630478, "learning_rate": 9.406443837722168e-05, "loss": 0.1001, "step": 274 }, { "epoch": 0.49, "grad_norm": 0.1470147669315338, "learning_rate": 9.402030697707398e-05, "loss": 0.0721, "step": 275 }, { "epoch": 0.49, "grad_norm": 0.15082985162734985, "learning_rate": 9.397602255527791e-05, "loss": 0.0698, "step": 276 }, { "epoch": 0.49, "grad_norm": 0.16322006285190582, "learning_rate": 9.393158526577323e-05, "loss": 0.0809, "step": 277 }, { "epoch": 0.49, "grad_norm": 0.10098633915185928, "learning_rate": 9.388699526303105e-05, "loss": 0.0386, "step": 278 }, { "epoch": 0.49, "grad_norm": 0.19049708545207977, "learning_rate": 9.38422527020534e-05, "loss": 0.0559, "step": 279 }, { "epoch": 0.5, "grad_norm": 0.22742775082588196, "learning_rate": 9.37973577383726e-05, "loss": 0.0802, "step": 280 }, { "epoch": 0.5, "grad_norm": 0.2055177539587021, "learning_rate": 9.375231052805072e-05, "loss": 0.1048, "step": 281 }, { "epoch": 0.5, "grad_norm": 0.1366245150566101, "learning_rate": 9.370711122767913e-05, "loss": 0.0204, "step": 282 }, { "epoch": 0.5, "grad_norm": 0.3235447406768799, "learning_rate": 9.36617599943778e-05, "loss": 0.0974, "step": 283 }, { "epoch": 0.5, "grad_norm": 0.09579204767942429, "learning_rate": 9.361625698579493e-05, "loss": 0.0151, "step": 284 }, { "epoch": 0.5, "eval_loss": 0.07987037301063538, "eval_runtime": 14.6437, "eval_samples_per_second": 32.574, "eval_steps_per_second": 8.195, "step": 284 }, { "epoch": 0.5, "grad_norm": 0.22850771248340607, "learning_rate": 9.357060236010625e-05, "loss": 0.0458, "step": 285 }, { "epoch": 0.51, "grad_norm": 0.4980478882789612, "learning_rate": 9.352479627601457e-05, "loss": 0.1306, "step": 286 }, { "epoch": 0.51, "grad_norm": 0.18770304322242737, "learning_rate": 9.347883889274923e-05, "loss": 0.0218, "step": 287 }, { "epoch": 0.51, "grad_norm": 0.6386083364486694, "learning_rate": 9.34327303700654e-05, "loss": 0.0912, "step": 288 }, { "epoch": 0.51, "grad_norm": 0.4997164309024811, "learning_rate": 9.338647086824372e-05, "loss": 0.1083, "step": 289 }, { "epoch": 0.51, "grad_norm": 0.31682559847831726, "learning_rate": 9.334006054808966e-05, "loss": 0.0947, "step": 290 }, { "epoch": 0.51, "grad_norm": 0.28325051069259644, "learning_rate": 9.329349957093292e-05, "loss": 0.0794, "step": 291 }, { "epoch": 0.52, "grad_norm": 0.5778185725212097, "learning_rate": 9.324678809862695e-05, "loss": 0.1223, "step": 292 }, { "epoch": 0.52, "grad_norm": 0.2953624725341797, "learning_rate": 9.319992629354828e-05, "loss": 0.0747, "step": 293 }, { "epoch": 0.52, "grad_norm": 0.28283196687698364, "learning_rate": 9.31529143185961e-05, "loss": 0.1099, "step": 294 }, { "epoch": 0.52, "grad_norm": 0.36138102412223816, "learning_rate": 9.310575233719154e-05, "loss": 0.1303, "step": 295 }, { "epoch": 0.52, "grad_norm": 0.16202205419540405, "learning_rate": 9.305844051327725e-05, "loss": 0.0805, "step": 296 }, { "epoch": 0.53, "grad_norm": 0.11523901671171188, "learning_rate": 9.30109790113167e-05, "loss": 0.0775, "step": 297 }, { "epoch": 0.53, "grad_norm": 0.10756238549947739, "learning_rate": 9.296336799629369e-05, "loss": 0.0795, "step": 298 }, { "epoch": 0.53, "grad_norm": 0.18142195045948029, "learning_rate": 9.291560763371173e-05, "loss": 0.0833, "step": 299 }, { "epoch": 0.53, "grad_norm": 0.14596430957317352, "learning_rate": 9.28676980895935e-05, "loss": 0.0904, "step": 300 }, { "epoch": 0.53, "grad_norm": 0.10054739564657211, "learning_rate": 9.28196395304803e-05, "loss": 0.0898, "step": 301 }, { "epoch": 0.53, "grad_norm": 0.24579764902591705, "learning_rate": 9.277143212343134e-05, "loss": 0.145, "step": 302 }, { "epoch": 0.54, "grad_norm": 0.13506978750228882, "learning_rate": 9.272307603602334e-05, "loss": 0.0847, "step": 303 }, { "epoch": 0.54, "grad_norm": 0.17480792105197906, "learning_rate": 9.267457143634979e-05, "loss": 0.125, "step": 304 }, { "epoch": 0.54, "grad_norm": 0.31458401679992676, "learning_rate": 9.262591849302048e-05, "loss": 0.1047, "step": 305 }, { "epoch": 0.54, "grad_norm": 0.17494355142116547, "learning_rate": 9.257711737516082e-05, "loss": 0.0576, "step": 306 }, { "epoch": 0.54, "grad_norm": 0.2996468245983124, "learning_rate": 9.252816825241134e-05, "loss": 0.1012, "step": 307 }, { "epoch": 0.54, "grad_norm": 0.19122976064682007, "learning_rate": 9.247907129492707e-05, "loss": 0.0878, "step": 308 }, { "epoch": 0.55, "grad_norm": 0.16079925000667572, "learning_rate": 9.242982667337685e-05, "loss": 0.0778, "step": 309 }, { "epoch": 0.55, "grad_norm": 0.2628028392791748, "learning_rate": 9.238043455894293e-05, "loss": 0.0987, "step": 310 }, { "epoch": 0.55, "grad_norm": 0.318097859621048, "learning_rate": 9.23308951233202e-05, "loss": 0.1108, "step": 311 }, { "epoch": 0.55, "grad_norm": 0.2207389920949936, "learning_rate": 9.228120853871571e-05, "loss": 0.0826, "step": 312 }, { "epoch": 0.55, "grad_norm": 0.34375905990600586, "learning_rate": 9.223137497784797e-05, "loss": 0.1174, "step": 313 }, { "epoch": 0.56, "grad_norm": 0.16714760661125183, "learning_rate": 9.218139461394644e-05, "loss": 0.0883, "step": 314 }, { "epoch": 0.56, "grad_norm": 0.24213539063930511, "learning_rate": 9.213126762075088e-05, "loss": 0.0686, "step": 315 }, { "epoch": 0.56, "grad_norm": 0.2654499113559723, "learning_rate": 9.208099417251077e-05, "loss": 0.1185, "step": 316 }, { "epoch": 0.56, "grad_norm": 0.1353083997964859, "learning_rate": 9.203057444398469e-05, "loss": 0.0806, "step": 317 }, { "epoch": 0.56, "grad_norm": 0.30304938554763794, "learning_rate": 9.198000861043967e-05, "loss": 0.0817, "step": 318 }, { "epoch": 0.56, "grad_norm": 0.14495517313480377, "learning_rate": 9.192929684765067e-05, "loss": 0.0436, "step": 319 }, { "epoch": 0.57, "grad_norm": 0.2180556207895279, "learning_rate": 9.187843933189995e-05, "loss": 0.1255, "step": 320 }, { "epoch": 0.57, "grad_norm": 0.3051697611808777, "learning_rate": 9.182743623997634e-05, "loss": 0.1241, "step": 321 }, { "epoch": 0.57, "grad_norm": 0.42936787009239197, "learning_rate": 9.17762877491748e-05, "loss": 0.1847, "step": 322 }, { "epoch": 0.57, "grad_norm": 0.20895107090473175, "learning_rate": 9.172499403729566e-05, "loss": 0.0939, "step": 323 }, { "epoch": 0.57, "grad_norm": 0.15273532271385193, "learning_rate": 9.167355528264414e-05, "loss": 0.1012, "step": 324 }, { "epoch": 0.57, "grad_norm": 0.15428248047828674, "learning_rate": 9.162197166402956e-05, "loss": 0.061, "step": 325 }, { "epoch": 0.58, "grad_norm": 0.13089029490947723, "learning_rate": 9.157024336076487e-05, "loss": 0.089, "step": 326 }, { "epoch": 0.58, "grad_norm": 0.12000248581171036, "learning_rate": 9.151837055266594e-05, "loss": 0.0813, "step": 327 }, { "epoch": 0.58, "grad_norm": 0.12965545058250427, "learning_rate": 9.146635342005099e-05, "loss": 0.113, "step": 328 }, { "epoch": 0.58, "grad_norm": 0.12225235253572464, "learning_rate": 9.14141921437399e-05, "loss": 0.0968, "step": 329 }, { "epoch": 0.58, "grad_norm": 0.11869696527719498, "learning_rate": 9.136188690505363e-05, "loss": 0.0752, "step": 330 }, { "epoch": 0.59, "grad_norm": 0.22600843012332916, "learning_rate": 9.130943788581359e-05, "loss": 0.1049, "step": 331 }, { "epoch": 0.59, "grad_norm": 0.13381795585155487, "learning_rate": 9.125684526834099e-05, "loss": 0.0917, "step": 332 }, { "epoch": 0.59, "grad_norm": 0.12936879694461823, "learning_rate": 9.120410923545619e-05, "loss": 0.0782, "step": 333 }, { "epoch": 0.59, "grad_norm": 0.14804388582706451, "learning_rate": 9.115122997047811e-05, "loss": 0.0959, "step": 334 }, { "epoch": 0.59, "grad_norm": 0.18504676222801208, "learning_rate": 9.109820765722357e-05, "loss": 0.1126, "step": 335 }, { "epoch": 0.59, "grad_norm": 0.1957363486289978, "learning_rate": 9.10450424800066e-05, "loss": 0.101, "step": 336 }, { "epoch": 0.6, "grad_norm": 0.15677915513515472, "learning_rate": 9.099173462363792e-05, "loss": 0.0775, "step": 337 }, { "epoch": 0.6, "grad_norm": 0.124906025826931, "learning_rate": 9.093828427342418e-05, "loss": 0.07, "step": 338 }, { "epoch": 0.6, "grad_norm": 0.12451624125242233, "learning_rate": 9.088469161516735e-05, "loss": 0.0588, "step": 339 }, { "epoch": 0.6, "grad_norm": 0.509678304195404, "learning_rate": 9.083095683516414e-05, "loss": 0.1563, "step": 340 }, { "epoch": 0.6, "grad_norm": 0.2245551496744156, "learning_rate": 9.077708012020524e-05, "loss": 0.1029, "step": 341 }, { "epoch": 0.61, "grad_norm": 0.2735763192176819, "learning_rate": 9.072306165757476e-05, "loss": 0.0958, "step": 342 }, { "epoch": 0.61, "grad_norm": 0.2062731385231018, "learning_rate": 9.066890163504955e-05, "loss": 0.0638, "step": 343 }, { "epoch": 0.61, "grad_norm": 0.1664024293422699, "learning_rate": 9.061460024089853e-05, "loss": 0.0555, "step": 344 }, { "epoch": 0.61, "grad_norm": 0.15788845717906952, "learning_rate": 9.056015766388205e-05, "loss": 0.0509, "step": 345 }, { "epoch": 0.61, "grad_norm": 0.185616135597229, "learning_rate": 9.050557409325125e-05, "loss": 0.1196, "step": 346 }, { "epoch": 0.61, "grad_norm": 0.24650661647319794, "learning_rate": 9.045084971874738e-05, "loss": 0.0723, "step": 347 }, { "epoch": 0.62, "grad_norm": 0.23959776759147644, "learning_rate": 9.039598473060113e-05, "loss": 0.1139, "step": 348 }, { "epoch": 0.62, "grad_norm": 0.24370582401752472, "learning_rate": 9.034097931953201e-05, "loss": 0.0559, "step": 349 }, { "epoch": 0.62, "grad_norm": 0.11590461432933807, "learning_rate": 9.028583367674765e-05, "loss": 0.0285, "step": 350 }, { "epoch": 0.62, "grad_norm": 0.21419131755828857, "learning_rate": 9.023054799394316e-05, "loss": 0.0686, "step": 351 }, { "epoch": 0.62, "grad_norm": 0.2115790992975235, "learning_rate": 9.017512246330042e-05, "loss": 0.071, "step": 352 }, { "epoch": 0.62, "grad_norm": 0.2025454044342041, "learning_rate": 9.011955727748748e-05, "loss": 0.0993, "step": 353 }, { "epoch": 0.63, "grad_norm": 0.22743502259254456, "learning_rate": 9.006385262965786e-05, "loss": 0.0705, "step": 354 }, { "epoch": 0.63, "grad_norm": 0.16963045299053192, "learning_rate": 9.00080087134498e-05, "loss": 0.0569, "step": 355 }, { "epoch": 0.63, "grad_norm": 0.12319042533636093, "learning_rate": 8.995202572298576e-05, "loss": 0.0427, "step": 356 }, { "epoch": 0.63, "grad_norm": 0.15424852073192596, "learning_rate": 8.989590385287155e-05, "loss": 0.0564, "step": 357 }, { "epoch": 0.63, "grad_norm": 0.306594580411911, "learning_rate": 8.983964329819583e-05, "loss": 0.095, "step": 358 }, { "epoch": 0.64, "grad_norm": 0.13876177370548248, "learning_rate": 8.978324425452931e-05, "loss": 0.0641, "step": 359 }, { "epoch": 0.64, "grad_norm": 0.19561870396137238, "learning_rate": 8.972670691792409e-05, "loss": 0.0635, "step": 360 }, { "epoch": 0.64, "grad_norm": 0.3458711504936218, "learning_rate": 8.967003148491304e-05, "loss": 0.1328, "step": 361 }, { "epoch": 0.64, "grad_norm": 0.1129189059138298, "learning_rate": 8.961321815250905e-05, "loss": 0.0205, "step": 362 }, { "epoch": 0.64, "grad_norm": 0.3680332601070404, "learning_rate": 8.955626711820438e-05, "loss": 0.1302, "step": 363 }, { "epoch": 0.64, "grad_norm": 0.2695287764072418, "learning_rate": 8.949917857996996e-05, "loss": 0.0511, "step": 364 }, { "epoch": 0.65, "grad_norm": 0.17332953214645386, "learning_rate": 8.94419527362547e-05, "loss": 0.0494, "step": 365 }, { "epoch": 0.65, "grad_norm": 0.11610284447669983, "learning_rate": 8.938458978598483e-05, "loss": 0.0381, "step": 366 }, { "epoch": 0.65, "grad_norm": 0.23595061898231506, "learning_rate": 8.932708992856315e-05, "loss": 0.0802, "step": 367 }, { "epoch": 0.65, "grad_norm": 0.26876452565193176, "learning_rate": 8.926945336386838e-05, "loss": 0.0461, "step": 368 }, { "epoch": 0.65, "grad_norm": 0.19504375755786896, "learning_rate": 8.921168029225448e-05, "loss": 0.0317, "step": 369 }, { "epoch": 0.65, "grad_norm": 0.4416268467903137, "learning_rate": 8.915377091454992e-05, "loss": 0.0952, "step": 370 }, { "epoch": 0.66, "grad_norm": 0.32519325613975525, "learning_rate": 8.909572543205698e-05, "loss": 0.1027, "step": 371 }, { "epoch": 0.66, "grad_norm": 0.3939536511898041, "learning_rate": 8.903754404655106e-05, "loss": 0.1718, "step": 372 }, { "epoch": 0.66, "grad_norm": 0.20514678955078125, "learning_rate": 8.897922696027999e-05, "loss": 0.06, "step": 373 }, { "epoch": 0.66, "grad_norm": 0.3049127459526062, "learning_rate": 8.892077437596332e-05, "loss": 0.1014, "step": 374 }, { "epoch": 0.66, "grad_norm": 0.19251297414302826, "learning_rate": 8.88621864967916e-05, "loss": 0.048, "step": 375 }, { "epoch": 0.67, "grad_norm": 0.21226820349693298, "learning_rate": 8.880346352642575e-05, "loss": 0.0652, "step": 376 }, { "epoch": 0.67, "grad_norm": 0.42633509635925293, "learning_rate": 8.874460566899616e-05, "loss": 0.1083, "step": 377 }, { "epoch": 0.67, "grad_norm": 0.24073313176631927, "learning_rate": 8.868561312910221e-05, "loss": 0.0851, "step": 378 }, { "epoch": 0.67, "grad_norm": 0.3047339618206024, "learning_rate": 8.862648611181145e-05, "loss": 0.086, "step": 379 }, { "epoch": 0.67, "grad_norm": 0.2227114737033844, "learning_rate": 8.856722482265886e-05, "loss": 0.1196, "step": 380 }, { "epoch": 0.67, "grad_norm": 0.1860799938440323, "learning_rate": 8.850782946764619e-05, "loss": 0.0779, "step": 381 }, { "epoch": 0.68, "grad_norm": 0.2109043002128601, "learning_rate": 8.844830025324122e-05, "loss": 0.076, "step": 382 }, { "epoch": 0.68, "grad_norm": 0.1998620331287384, "learning_rate": 8.838863738637706e-05, "loss": 0.1027, "step": 383 }, { "epoch": 0.68, "grad_norm": 0.12607474625110626, "learning_rate": 8.832884107445139e-05, "loss": 0.0436, "step": 384 }, { "epoch": 0.68, "grad_norm": 0.2890150845050812, "learning_rate": 8.826891152532579e-05, "loss": 0.0966, "step": 385 }, { "epoch": 0.68, "grad_norm": 0.4496447443962097, "learning_rate": 8.820884894732497e-05, "loss": 0.1575, "step": 386 }, { "epoch": 0.68, "grad_norm": 0.19411596655845642, "learning_rate": 8.814865354923613e-05, "loss": 0.1201, "step": 387 }, { "epoch": 0.69, "grad_norm": 0.19513021409511566, "learning_rate": 8.808832554030808e-05, "loss": 0.0747, "step": 388 }, { "epoch": 0.69, "grad_norm": 0.14038780331611633, "learning_rate": 8.802786513025068e-05, "loss": 0.0608, "step": 389 }, { "epoch": 0.69, "grad_norm": 0.14907363057136536, "learning_rate": 8.796727252923402e-05, "loss": 0.0843, "step": 390 }, { "epoch": 0.69, "grad_norm": 0.18512780964374542, "learning_rate": 8.790654794788769e-05, "loss": 0.0988, "step": 391 }, { "epoch": 0.69, "grad_norm": 0.17880797386169434, "learning_rate": 8.784569159730007e-05, "loss": 0.079, "step": 392 }, { "epoch": 0.7, "grad_norm": 0.16263402998447418, "learning_rate": 8.778470368901762e-05, "loss": 0.0704, "step": 393 }, { "epoch": 0.7, "grad_norm": 0.27071598172187805, "learning_rate": 8.772358443504405e-05, "loss": 0.0983, "step": 394 }, { "epoch": 0.7, "grad_norm": 0.23446398973464966, "learning_rate": 8.766233404783974e-05, "loss": 0.0577, "step": 395 }, { "epoch": 0.7, "grad_norm": 0.2932927906513214, "learning_rate": 8.760095274032083e-05, "loss": 0.0946, "step": 396 }, { "epoch": 0.7, "grad_norm": 0.29224956035614014, "learning_rate": 8.75394407258586e-05, "loss": 0.078, "step": 397 }, { "epoch": 0.7, "grad_norm": 0.15467233955860138, "learning_rate": 8.747779821827868e-05, "loss": 0.0779, "step": 398 }, { "epoch": 0.71, "grad_norm": 0.1883499026298523, "learning_rate": 8.741602543186032e-05, "loss": 0.0721, "step": 399 }, { "epoch": 0.71, "grad_norm": 0.25179481506347656, "learning_rate": 8.735412258133562e-05, "loss": 0.0875, "step": 400 }, { "epoch": 0.71, "grad_norm": 0.23851999640464783, "learning_rate": 8.729208988188881e-05, "loss": 0.0959, "step": 401 }, { "epoch": 0.71, "grad_norm": 0.2662704885005951, "learning_rate": 8.722992754915554e-05, "loss": 0.1025, "step": 402 }, { "epoch": 0.71, "grad_norm": 0.17909982800483704, "learning_rate": 8.716763579922204e-05, "loss": 0.0504, "step": 403 }, { "epoch": 0.71, "grad_norm": 0.17002324759960175, "learning_rate": 8.710521484862439e-05, "loss": 0.0856, "step": 404 }, { "epoch": 0.72, "grad_norm": 0.2229025810956955, "learning_rate": 8.704266491434788e-05, "loss": 0.0591, "step": 405 }, { "epoch": 0.72, "grad_norm": 0.1444559544324875, "learning_rate": 8.697998621382607e-05, "loss": 0.0297, "step": 406 }, { "epoch": 0.72, "grad_norm": 0.2677093744277954, "learning_rate": 8.69171789649402e-05, "loss": 0.0543, "step": 407 }, { "epoch": 0.72, "grad_norm": 0.5875506401062012, "learning_rate": 8.685424338601834e-05, "loss": 0.1199, "step": 408 }, { "epoch": 0.72, "grad_norm": 0.34576529264450073, "learning_rate": 8.679117969583464e-05, "loss": 0.1003, "step": 409 }, { "epoch": 0.73, "grad_norm": 0.2765222489833832, "learning_rate": 8.672798811360863e-05, "loss": 0.0358, "step": 410 }, { "epoch": 0.73, "grad_norm": 0.17154745757579803, "learning_rate": 8.666466885900438e-05, "loss": 0.0736, "step": 411 }, { "epoch": 0.73, "grad_norm": 0.1607416719198227, "learning_rate": 8.660122215212977e-05, "loss": 0.0678, "step": 412 }, { "epoch": 0.73, "grad_norm": 0.1216413602232933, "learning_rate": 8.653764821353573e-05, "loss": 0.0341, "step": 413 }, { "epoch": 0.73, "grad_norm": 0.3713608980178833, "learning_rate": 8.647394726421547e-05, "loss": 0.118, "step": 414 }, { "epoch": 0.73, "grad_norm": 0.48576387763023376, "learning_rate": 8.641011952560371e-05, "loss": 0.0931, "step": 415 }, { "epoch": 0.74, "grad_norm": 0.14704179763793945, "learning_rate": 8.63461652195759e-05, "loss": 0.0285, "step": 416 }, { "epoch": 0.74, "grad_norm": 0.3306657671928406, "learning_rate": 8.628208456844747e-05, "loss": 0.0737, "step": 417 }, { "epoch": 0.74, "grad_norm": 0.39767885208129883, "learning_rate": 8.621787779497305e-05, "loss": 0.097, "step": 418 }, { "epoch": 0.74, "grad_norm": 0.3198534846305847, "learning_rate": 8.615354512234569e-05, "loss": 0.0731, "step": 419 }, { "epoch": 0.74, "grad_norm": 0.19191338121891022, "learning_rate": 8.608908677419606e-05, "loss": 0.0697, "step": 420 }, { "epoch": 0.74, "grad_norm": 0.26490989327430725, "learning_rate": 8.602450297459172e-05, "loss": 0.1214, "step": 421 }, { "epoch": 0.75, "grad_norm": 0.3545917868614197, "learning_rate": 8.595979394803634e-05, "loss": 0.0933, "step": 422 }, { "epoch": 0.75, "grad_norm": 0.28239014744758606, "learning_rate": 8.589495991946885e-05, "loss": 0.0707, "step": 423 }, { "epoch": 0.75, "grad_norm": 0.48272502422332764, "learning_rate": 8.583000111426276e-05, "loss": 0.0831, "step": 424 }, { "epoch": 0.75, "grad_norm": 0.15938633680343628, "learning_rate": 8.576491775822527e-05, "loss": 0.0899, "step": 425 }, { "epoch": 0.75, "grad_norm": 0.2616162896156311, "learning_rate": 8.569971007759657e-05, "loss": 0.118, "step": 426 }, { "epoch": 0.75, "eval_loss": 0.0783080980181694, "eval_runtime": 14.6414, "eval_samples_per_second": 32.579, "eval_steps_per_second": 8.196, "step": 426 }, { "epoch": 0.76, "grad_norm": 0.19521737098693848, "learning_rate": 8.563437829904903e-05, "loss": 0.0814, "step": 427 }, { "epoch": 0.76, "grad_norm": 0.194011390209198, "learning_rate": 8.55689226496864e-05, "loss": 0.0799, "step": 428 }, { "epoch": 0.76, "grad_norm": 0.2743787169456482, "learning_rate": 8.550334335704298e-05, "loss": 0.0869, "step": 429 }, { "epoch": 0.76, "grad_norm": 0.1369010955095291, "learning_rate": 8.543764064908295e-05, "loss": 0.0435, "step": 430 }, { "epoch": 0.76, "grad_norm": 0.24237819015979767, "learning_rate": 8.537181475419944e-05, "loss": 0.1148, "step": 431 }, { "epoch": 0.76, "grad_norm": 0.14511409401893616, "learning_rate": 8.530586590121383e-05, "loss": 0.0764, "step": 432 }, { "epoch": 0.77, "grad_norm": 0.15356196463108063, "learning_rate": 8.523979431937492e-05, "loss": 0.05, "step": 433 }, { "epoch": 0.77, "grad_norm": 0.21860916912555695, "learning_rate": 8.51736002383581e-05, "loss": 0.0971, "step": 434 }, { "epoch": 0.77, "grad_norm": 0.23724305629730225, "learning_rate": 8.510728388826463e-05, "loss": 0.1049, "step": 435 }, { "epoch": 0.77, "grad_norm": 0.25301918387413025, "learning_rate": 8.50408454996208e-05, "loss": 0.0848, "step": 436 }, { "epoch": 0.77, "grad_norm": 0.22409550845623016, "learning_rate": 8.497428530337706e-05, "loss": 0.101, "step": 437 }, { "epoch": 0.77, "grad_norm": 0.1324710100889206, "learning_rate": 8.490760353090737e-05, "loss": 0.0723, "step": 438 }, { "epoch": 0.78, "grad_norm": 0.1362515389919281, "learning_rate": 8.484080041400826e-05, "loss": 0.0709, "step": 439 }, { "epoch": 0.78, "grad_norm": 0.16375669836997986, "learning_rate": 8.477387618489807e-05, "loss": 0.0405, "step": 440 }, { "epoch": 0.78, "grad_norm": 0.21752700209617615, "learning_rate": 8.470683107621616e-05, "loss": 0.0455, "step": 441 }, { "epoch": 0.78, "grad_norm": 0.1989530771970749, "learning_rate": 8.463966532102207e-05, "loss": 0.0704, "step": 442 }, { "epoch": 0.78, "grad_norm": 0.192123144865036, "learning_rate": 8.457237915279476e-05, "loss": 0.063, "step": 443 }, { "epoch": 0.79, "grad_norm": 0.1221012994647026, "learning_rate": 8.450497280543174e-05, "loss": 0.0302, "step": 444 }, { "epoch": 0.79, "grad_norm": 0.5705539584159851, "learning_rate": 8.443744651324827e-05, "loss": 0.1531, "step": 445 }, { "epoch": 0.79, "grad_norm": 0.21490426361560822, "learning_rate": 8.436980051097659e-05, "loss": 0.0626, "step": 446 }, { "epoch": 0.79, "grad_norm": 0.2654309570789337, "learning_rate": 8.430203503376505e-05, "loss": 0.0838, "step": 447 }, { "epoch": 0.79, "grad_norm": 0.22457195818424225, "learning_rate": 8.423415031717733e-05, "loss": 0.0309, "step": 448 }, { "epoch": 0.79, "grad_norm": 0.10934180021286011, "learning_rate": 8.416614659719157e-05, "loss": 0.0132, "step": 449 }, { "epoch": 0.8, "grad_norm": 0.32622861862182617, "learning_rate": 8.409802411019963e-05, "loss": 0.107, "step": 450 }, { "epoch": 0.8, "grad_norm": 0.24298590421676636, "learning_rate": 8.40297830930062e-05, "loss": 0.1268, "step": 451 }, { "epoch": 0.8, "grad_norm": 0.29994437098503113, "learning_rate": 8.396142378282798e-05, "loss": 0.0747, "step": 452 }, { "epoch": 0.8, "grad_norm": 0.16668649017810822, "learning_rate": 8.389294641729293e-05, "loss": 0.0479, "step": 453 }, { "epoch": 0.8, "grad_norm": 0.26706060767173767, "learning_rate": 8.382435123443934e-05, "loss": 0.1116, "step": 454 }, { "epoch": 0.8, "grad_norm": 0.1750030219554901, "learning_rate": 8.375563847271506e-05, "loss": 0.0597, "step": 455 }, { "epoch": 0.81, "grad_norm": 0.2318125069141388, "learning_rate": 8.36868083709767e-05, "loss": 0.0909, "step": 456 }, { "epoch": 0.81, "grad_norm": 0.1834569126367569, "learning_rate": 8.361786116848872e-05, "loss": 0.0813, "step": 457 }, { "epoch": 0.81, "grad_norm": 0.27685895562171936, "learning_rate": 8.354879710492264e-05, "loss": 0.1301, "step": 458 }, { "epoch": 0.81, "grad_norm": 0.16120545566082, "learning_rate": 8.347961642035624e-05, "loss": 0.0717, "step": 459 }, { "epoch": 0.81, "grad_norm": 0.17625439167022705, "learning_rate": 8.341031935527267e-05, "loss": 0.0867, "step": 460 }, { "epoch": 0.82, "grad_norm": 0.2321135252714157, "learning_rate": 8.334090615055966e-05, "loss": 0.1122, "step": 461 }, { "epoch": 0.82, "grad_norm": 0.12720270454883575, "learning_rate": 8.327137704750862e-05, "loss": 0.0375, "step": 462 }, { "epoch": 0.82, "grad_norm": 0.2046743929386139, "learning_rate": 8.320173228781389e-05, "loss": 0.0808, "step": 463 }, { "epoch": 0.82, "grad_norm": 0.16513489186763763, "learning_rate": 8.313197211357181e-05, "loss": 0.0825, "step": 464 }, { "epoch": 0.82, "grad_norm": 0.12492749094963074, "learning_rate": 8.306209676727994e-05, "loss": 0.0876, "step": 465 }, { "epoch": 0.82, "grad_norm": 0.1343008577823639, "learning_rate": 8.299210649183619e-05, "loss": 0.0852, "step": 466 }, { "epoch": 0.83, "grad_norm": 0.13951613008975983, "learning_rate": 8.2922001530538e-05, "loss": 0.1003, "step": 467 }, { "epoch": 0.83, "grad_norm": 0.16553768515586853, "learning_rate": 8.285178212708143e-05, "loss": 0.0662, "step": 468 }, { "epoch": 0.83, "grad_norm": 0.15311822295188904, "learning_rate": 8.278144852556042e-05, "loss": 0.0785, "step": 469 }, { "epoch": 0.83, "grad_norm": 0.1948017179965973, "learning_rate": 8.271100097046584e-05, "loss": 0.0898, "step": 470 }, { "epoch": 0.83, "grad_norm": 0.11078551411628723, "learning_rate": 8.264043970668469e-05, "loss": 0.0386, "step": 471 }, { "epoch": 0.84, "grad_norm": 0.1605585515499115, "learning_rate": 8.256976497949924e-05, "loss": 0.0497, "step": 472 }, { "epoch": 0.84, "grad_norm": 0.1617887318134308, "learning_rate": 8.249897703458619e-05, "loss": 0.0624, "step": 473 }, { "epoch": 0.84, "grad_norm": 0.1274091750383377, "learning_rate": 8.242807611801578e-05, "loss": 0.0578, "step": 474 }, { "epoch": 0.84, "grad_norm": 0.15953154861927032, "learning_rate": 8.235706247625098e-05, "loss": 0.042, "step": 475 }, { "epoch": 0.84, "grad_norm": 0.27984094619750977, "learning_rate": 8.228593635614659e-05, "loss": 0.1037, "step": 476 }, { "epoch": 0.84, "grad_norm": 0.1895013153553009, "learning_rate": 8.22146980049484e-05, "loss": 0.0728, "step": 477 }, { "epoch": 0.85, "grad_norm": 0.1580246388912201, "learning_rate": 8.214334767029239e-05, "loss": 0.0398, "step": 478 }, { "epoch": 0.85, "grad_norm": 0.2391231805086136, "learning_rate": 8.207188560020373e-05, "loss": 0.0707, "step": 479 }, { "epoch": 0.85, "grad_norm": 0.25975751876831055, "learning_rate": 8.200031204309603e-05, "loss": 0.1369, "step": 480 }, { "epoch": 0.85, "grad_norm": 0.32591861486434937, "learning_rate": 8.192862724777051e-05, "loss": 0.0878, "step": 481 }, { "epoch": 0.85, "grad_norm": 0.32488566637039185, "learning_rate": 8.185683146341496e-05, "loss": 0.0692, "step": 482 }, { "epoch": 0.85, "grad_norm": 0.1918002963066101, "learning_rate": 8.178492493960309e-05, "loss": 0.0942, "step": 483 }, { "epoch": 0.86, "grad_norm": 0.43140751123428345, "learning_rate": 8.171290792629347e-05, "loss": 0.0979, "step": 484 }, { "epoch": 0.86, "grad_norm": 0.1771157830953598, "learning_rate": 8.164078067382882e-05, "loss": 0.0894, "step": 485 }, { "epoch": 0.86, "grad_norm": 0.25012728571891785, "learning_rate": 8.1568543432935e-05, "loss": 0.0734, "step": 486 }, { "epoch": 0.86, "grad_norm": 0.19328337907791138, "learning_rate": 8.149619645472031e-05, "loss": 0.0869, "step": 487 }, { "epoch": 0.86, "grad_norm": 0.18180640041828156, "learning_rate": 8.142373999067439e-05, "loss": 0.0897, "step": 488 }, { "epoch": 0.87, "grad_norm": 0.25753355026245117, "learning_rate": 8.135117429266757e-05, "loss": 0.0883, "step": 489 }, { "epoch": 0.87, "grad_norm": 0.23837833106517792, "learning_rate": 8.127849961294984e-05, "loss": 0.0549, "step": 490 }, { "epoch": 0.87, "grad_norm": 0.25032365322113037, "learning_rate": 8.120571620415006e-05, "loss": 0.0976, "step": 491 }, { "epoch": 0.87, "grad_norm": 0.30728307366371155, "learning_rate": 8.113282431927502e-05, "loss": 0.0709, "step": 492 }, { "epoch": 0.87, "grad_norm": 0.1391928791999817, "learning_rate": 8.10598242117086e-05, "loss": 0.0378, "step": 493 }, { "epoch": 0.87, "grad_norm": 0.1786775141954422, "learning_rate": 8.098671613521089e-05, "loss": 0.0671, "step": 494 }, { "epoch": 0.88, "grad_norm": 0.21963584423065186, "learning_rate": 8.091350034391732e-05, "loss": 0.0936, "step": 495 }, { "epoch": 0.88, "grad_norm": 0.13954299688339233, "learning_rate": 8.084017709233767e-05, "loss": 0.052, "step": 496 }, { "epoch": 0.88, "grad_norm": 0.19656923413276672, "learning_rate": 8.076674663535537e-05, "loss": 0.0584, "step": 497 }, { "epoch": 0.88, "grad_norm": 0.14330637454986572, "learning_rate": 8.069320922822643e-05, "loss": 0.0786, "step": 498 }, { "epoch": 0.88, "grad_norm": 0.19019991159439087, "learning_rate": 8.061956512657871e-05, "loss": 0.0837, "step": 499 }, { "epoch": 0.88, "grad_norm": 0.2079285979270935, "learning_rate": 8.05458145864109e-05, "loss": 0.0459, "step": 500 }, { "epoch": 0.89, "grad_norm": 0.3516862392425537, "learning_rate": 8.047195786409172e-05, "loss": 0.191, "step": 501 }, { "epoch": 0.89, "grad_norm": 0.192392036318779, "learning_rate": 8.039799521635896e-05, "loss": 0.1072, "step": 502 }, { "epoch": 0.89, "grad_norm": 0.1787678301334381, "learning_rate": 8.032392690031867e-05, "loss": 0.0649, "step": 503 }, { "epoch": 0.89, "grad_norm": 0.21046535670757294, "learning_rate": 8.024975317344421e-05, "loss": 0.1065, "step": 504 }, { "epoch": 0.89, "grad_norm": 0.1215684562921524, "learning_rate": 8.017547429357532e-05, "loss": 0.0433, "step": 505 }, { "epoch": 0.9, "grad_norm": 0.1342051476240158, "learning_rate": 8.010109051891731e-05, "loss": 0.0774, "step": 506 }, { "epoch": 0.9, "grad_norm": 0.10215850174427032, "learning_rate": 8.002660210804011e-05, "loss": 0.0338, "step": 507 }, { "epoch": 0.9, "grad_norm": 0.23539598286151886, "learning_rate": 7.995200931987743e-05, "loss": 0.0516, "step": 508 }, { "epoch": 0.9, "grad_norm": 0.15601155161857605, "learning_rate": 7.987731241372572e-05, "loss": 0.0559, "step": 509 }, { "epoch": 0.9, "grad_norm": 0.165851429104805, "learning_rate": 7.98025116492434e-05, "loss": 0.0372, "step": 510 }, { "epoch": 0.9, "grad_norm": 0.21045421063899994, "learning_rate": 7.972760728644996e-05, "loss": 0.086, "step": 511 }, { "epoch": 0.91, "grad_norm": 0.14814500510692596, "learning_rate": 7.965259958572496e-05, "loss": 0.0587, "step": 512 }, { "epoch": 0.91, "grad_norm": 0.15543898940086365, "learning_rate": 7.95774888078072e-05, "loss": 0.0682, "step": 513 }, { "epoch": 0.91, "grad_norm": 0.13527697324752808, "learning_rate": 7.950227521379382e-05, "loss": 0.0468, "step": 514 }, { "epoch": 0.91, "grad_norm": 0.35773295164108276, "learning_rate": 7.94269590651393e-05, "loss": 0.1273, "step": 515 }, { "epoch": 0.91, "grad_norm": 0.22433511912822723, "learning_rate": 7.935154062365467e-05, "loss": 0.0438, "step": 516 }, { "epoch": 0.91, "grad_norm": 0.1453983038663864, "learning_rate": 7.927602015150655e-05, "loss": 0.0367, "step": 517 }, { "epoch": 0.92, "grad_norm": 0.2343645691871643, "learning_rate": 7.920039791121617e-05, "loss": 0.128, "step": 518 }, { "epoch": 0.92, "grad_norm": 0.25178173184394836, "learning_rate": 7.912467416565861e-05, "loss": 0.1094, "step": 519 }, { "epoch": 0.92, "grad_norm": 0.3170076012611389, "learning_rate": 7.904884917806174e-05, "loss": 0.1323, "step": 520 }, { "epoch": 0.92, "grad_norm": 0.15917453169822693, "learning_rate": 7.897292321200538e-05, "loss": 0.036, "step": 521 }, { "epoch": 0.92, "grad_norm": 0.31578320264816284, "learning_rate": 7.889689653142036e-05, "loss": 0.0909, "step": 522 }, { "epoch": 0.93, "grad_norm": 0.16602741181850433, "learning_rate": 7.882076940058764e-05, "loss": 0.0371, "step": 523 }, { "epoch": 0.93, "grad_norm": 0.3235325813293457, "learning_rate": 7.874454208413731e-05, "loss": 0.1561, "step": 524 }, { "epoch": 0.93, "grad_norm": 0.1256486028432846, "learning_rate": 7.866821484704776e-05, "loss": 0.0364, "step": 525 }, { "epoch": 0.93, "grad_norm": 0.2234162986278534, "learning_rate": 7.859178795464472e-05, "loss": 0.0883, "step": 526 }, { "epoch": 0.93, "grad_norm": 0.1564294993877411, "learning_rate": 7.851526167260034e-05, "loss": 0.0679, "step": 527 }, { "epoch": 0.93, "grad_norm": 0.16309525072574615, "learning_rate": 7.84386362669322e-05, "loss": 0.0912, "step": 528 }, { "epoch": 0.94, "grad_norm": 0.21584004163742065, "learning_rate": 7.836191200400255e-05, "loss": 0.0695, "step": 529 }, { "epoch": 0.94, "grad_norm": 0.15948422253131866, "learning_rate": 7.828508915051724e-05, "loss": 0.0459, "step": 530 }, { "epoch": 0.94, "grad_norm": 0.24016940593719482, "learning_rate": 7.82081679735248e-05, "loss": 0.1127, "step": 531 }, { "epoch": 0.94, "grad_norm": 0.2894397974014282, "learning_rate": 7.813114874041557e-05, "loss": 0.0584, "step": 532 }, { "epoch": 0.94, "grad_norm": 0.20707662403583527, "learning_rate": 7.805403171892079e-05, "loss": 0.1045, "step": 533 }, { "epoch": 0.94, "grad_norm": 0.23427248001098633, "learning_rate": 7.797681717711161e-05, "loss": 0.1345, "step": 534 }, { "epoch": 0.95, "grad_norm": 0.13141866028308868, "learning_rate": 7.789950538339812e-05, "loss": 0.052, "step": 535 }, { "epoch": 0.95, "grad_norm": 0.21118536591529846, "learning_rate": 7.782209660652855e-05, "loss": 0.1272, "step": 536 }, { "epoch": 0.95, "grad_norm": 0.15485352277755737, "learning_rate": 7.77445911155882e-05, "loss": 0.0686, "step": 537 }, { "epoch": 0.95, "grad_norm": 0.11380946636199951, "learning_rate": 7.766698917999861e-05, "loss": 0.0735, "step": 538 }, { "epoch": 0.95, "grad_norm": 0.26798170804977417, "learning_rate": 7.758929106951656e-05, "loss": 0.0934, "step": 539 }, { "epoch": 0.96, "grad_norm": 0.23003587126731873, "learning_rate": 7.751149705423312e-05, "loss": 0.0816, "step": 540 }, { "epoch": 0.96, "grad_norm": 0.2122953236103058, "learning_rate": 7.743360740457278e-05, "loss": 0.0827, "step": 541 }, { "epoch": 0.96, "grad_norm": 0.22673499584197998, "learning_rate": 7.735562239129247e-05, "loss": 0.1232, "step": 542 }, { "epoch": 0.96, "grad_norm": 0.24960415065288544, "learning_rate": 7.727754228548058e-05, "loss": 0.1124, "step": 543 }, { "epoch": 0.96, "grad_norm": 0.10405872017145157, "learning_rate": 7.719936735855611e-05, "loss": 0.0687, "step": 544 }, { "epoch": 0.96, "grad_norm": 0.16980154812335968, "learning_rate": 7.712109788226762e-05, "loss": 0.0874, "step": 545 }, { "epoch": 0.97, "grad_norm": 0.1486412137746811, "learning_rate": 7.704273412869238e-05, "loss": 0.0815, "step": 546 }, { "epoch": 0.97, "grad_norm": 0.14432762563228607, "learning_rate": 7.696427637023538e-05, "loss": 0.0752, "step": 547 }, { "epoch": 0.97, "grad_norm": 0.2627028822898865, "learning_rate": 7.688572487962835e-05, "loss": 0.0982, "step": 548 }, { "epoch": 0.97, "grad_norm": 0.16832011938095093, "learning_rate": 7.680707992992888e-05, "loss": 0.0895, "step": 549 }, { "epoch": 0.97, "grad_norm": 0.14999301731586456, "learning_rate": 7.672834179451942e-05, "loss": 0.0544, "step": 550 }, { "epoch": 0.97, "grad_norm": 0.14237482845783234, "learning_rate": 7.664951074710638e-05, "loss": 0.0623, "step": 551 }, { "epoch": 0.98, "grad_norm": 0.1694159060716629, "learning_rate": 7.657058706171911e-05, "loss": 0.0784, "step": 552 }, { "epoch": 0.98, "grad_norm": 0.1470886617898941, "learning_rate": 7.649157101270902e-05, "loss": 0.0635, "step": 553 }, { "epoch": 0.98, "grad_norm": 0.16492018103599548, "learning_rate": 7.641246287474855e-05, "loss": 0.0669, "step": 554 }, { "epoch": 0.98, "grad_norm": 0.195392444729805, "learning_rate": 7.633326292283028e-05, "loss": 0.0387, "step": 555 }, { "epoch": 0.98, "grad_norm": 0.17653177678585052, "learning_rate": 7.625397143226596e-05, "loss": 0.0592, "step": 556 }, { "epoch": 0.99, "grad_norm": 0.23455718159675598, "learning_rate": 7.617458867868553e-05, "loss": 0.0882, "step": 557 }, { "epoch": 0.99, "grad_norm": 0.3588998317718506, "learning_rate": 7.609511493803616e-05, "loss": 0.107, "step": 558 }, { "epoch": 0.99, "grad_norm": 0.2767946720123291, "learning_rate": 7.601555048658134e-05, "loss": 0.1609, "step": 559 }, { "epoch": 0.99, "grad_norm": 0.22181196510791779, "learning_rate": 7.593589560089985e-05, "loss": 0.0598, "step": 560 }, { "epoch": 0.99, "grad_norm": 0.30335313081741333, "learning_rate": 7.585615055788484e-05, "loss": 0.0825, "step": 561 }, { "epoch": 0.99, "grad_norm": 0.19477833807468414, "learning_rate": 7.577631563474291e-05, "loss": 0.0446, "step": 562 }, { "epoch": 1.0, "grad_norm": 0.11036123335361481, "learning_rate": 7.569639110899303e-05, "loss": 0.025, "step": 563 }, { "epoch": 1.0, "grad_norm": 0.19955220818519592, "learning_rate": 7.561637725846568e-05, "loss": 0.0484, "step": 564 }, { "epoch": 1.0, "grad_norm": 0.20293684303760529, "learning_rate": 7.553627436130183e-05, "loss": 0.0689, "step": 565 }, { "epoch": 1.0, "grad_norm": 0.18100765347480774, "learning_rate": 7.545608269595202e-05, "loss": 0.0371, "step": 566 }, { "epoch": 1.0, "grad_norm": 0.43053922057151794, "learning_rate": 7.537580254117531e-05, "loss": 0.0901, "step": 567 }, { "epoch": 1.0, "grad_norm": 0.3413926959037781, "learning_rate": 7.529543417603844e-05, "loss": 0.1088, "step": 568 }, { "epoch": 1.0, "eval_loss": 0.09266742318868637, "eval_runtime": 14.642, "eval_samples_per_second": 32.578, "eval_steps_per_second": 8.196, "step": 568 }, { "epoch": 1.01, "grad_norm": 0.1796027272939682, "learning_rate": 7.521497787991471e-05, "loss": 0.0244, "step": 569 }, { "epoch": 1.01, "grad_norm": 0.30515041947364807, "learning_rate": 7.513443393248312e-05, "loss": 0.0682, "step": 570 }, { "epoch": 1.01, "grad_norm": 0.3712550103664398, "learning_rate": 7.505380261372734e-05, "loss": 0.0921, "step": 571 }, { "epoch": 1.01, "grad_norm": 0.4219339191913605, "learning_rate": 7.497308420393477e-05, "loss": 0.0785, "step": 572 }, { "epoch": 1.01, "grad_norm": 0.24129725992679596, "learning_rate": 7.489227898369559e-05, "loss": 0.0851, "step": 573 }, { "epoch": 1.02, "grad_norm": 0.24595998227596283, "learning_rate": 7.481138723390164e-05, "loss": 0.1143, "step": 574 }, { "epoch": 1.02, "grad_norm": 0.13906948268413544, "learning_rate": 7.473040923574567e-05, "loss": 0.0402, "step": 575 }, { "epoch": 1.02, "grad_norm": 0.1885530650615692, "learning_rate": 7.464934527072016e-05, "loss": 0.0384, "step": 576 }, { "epoch": 1.02, "grad_norm": 0.13116823136806488, "learning_rate": 7.456819562061649e-05, "loss": 0.0447, "step": 577 }, { "epoch": 1.02, "grad_norm": 0.23953841626644135, "learning_rate": 7.448696056752383e-05, "loss": 0.0602, "step": 578 }, { "epoch": 1.02, "grad_norm": 0.17374739050865173, "learning_rate": 7.440564039382827e-05, "loss": 0.0657, "step": 579 }, { "epoch": 1.03, "grad_norm": 0.20921552181243896, "learning_rate": 7.432423538221178e-05, "loss": 0.0757, "step": 580 }, { "epoch": 1.03, "grad_norm": 0.10258325189352036, "learning_rate": 7.424274581565123e-05, "loss": 0.0237, "step": 581 }, { "epoch": 1.03, "grad_norm": 0.31752172112464905, "learning_rate": 7.416117197741742e-05, "loss": 0.0625, "step": 582 }, { "epoch": 1.03, "grad_norm": 0.229179248213768, "learning_rate": 7.407951415107413e-05, "loss": 0.0792, "step": 583 }, { "epoch": 1.03, "grad_norm": 0.16059361398220062, "learning_rate": 7.3997772620477e-05, "loss": 0.0718, "step": 584 }, { "epoch": 1.03, "grad_norm": 0.1626499593257904, "learning_rate": 7.391594766977277e-05, "loss": 0.0457, "step": 585 }, { "epoch": 1.04, "grad_norm": 0.1549261212348938, "learning_rate": 7.383403958339807e-05, "loss": 0.0544, "step": 586 }, { "epoch": 1.04, "grad_norm": 0.1588374525308609, "learning_rate": 7.375204864607852e-05, "loss": 0.0342, "step": 587 }, { "epoch": 1.04, "grad_norm": 0.09883646667003632, "learning_rate": 7.366997514282782e-05, "loss": 0.0292, "step": 588 }, { "epoch": 1.04, "grad_norm": 0.3421178460121155, "learning_rate": 7.358781935894659e-05, "loss": 0.0999, "step": 589 }, { "epoch": 1.04, "grad_norm": 0.12105683982372284, "learning_rate": 7.350558158002154e-05, "loss": 0.023, "step": 590 }, { "epoch": 1.05, "grad_norm": 0.15255074203014374, "learning_rate": 7.342326209192435e-05, "loss": 0.0423, "step": 591 }, { "epoch": 1.05, "grad_norm": 0.18337713181972504, "learning_rate": 7.33408611808108e-05, "loss": 0.0305, "step": 592 }, { "epoch": 1.05, "grad_norm": 0.12969495356082916, "learning_rate": 7.325837913311966e-05, "loss": 0.0175, "step": 593 }, { "epoch": 1.05, "grad_norm": 0.2849477231502533, "learning_rate": 7.317581623557177e-05, "loss": 0.0878, "step": 594 }, { "epoch": 1.05, "grad_norm": 0.24307942390441895, "learning_rate": 7.3093172775169e-05, "loss": 0.037, "step": 595 }, { "epoch": 1.05, "grad_norm": 0.2612784206867218, "learning_rate": 7.301044903919325e-05, "loss": 0.097, "step": 596 }, { "epoch": 1.06, "grad_norm": 0.2548207640647888, "learning_rate": 7.292764531520553e-05, "loss": 0.0928, "step": 597 }, { "epoch": 1.06, "grad_norm": 0.3221377432346344, "learning_rate": 7.284476189104485e-05, "loss": 0.0874, "step": 598 }, { "epoch": 1.06, "grad_norm": 0.20441681146621704, "learning_rate": 7.27617990548273e-05, "loss": 0.035, "step": 599 }, { "epoch": 1.06, "grad_norm": 0.26378926634788513, "learning_rate": 7.267875709494499e-05, "loss": 0.0494, "step": 600 }, { "epoch": 1.06, "grad_norm": 0.5505862832069397, "learning_rate": 7.259563630006512e-05, "loss": 0.1241, "step": 601 }, { "epoch": 1.07, "grad_norm": 0.13375498354434967, "learning_rate": 7.251243695912886e-05, "loss": 0.0241, "step": 602 }, { "epoch": 1.07, "grad_norm": 0.19569019973278046, "learning_rate": 7.242915936135051e-05, "loss": 0.0698, "step": 603 }, { "epoch": 1.07, "grad_norm": 0.28432735800743103, "learning_rate": 7.234580379621637e-05, "loss": 0.0641, "step": 604 }, { "epoch": 1.07, "grad_norm": 0.13998962938785553, "learning_rate": 7.22623705534837e-05, "loss": 0.0365, "step": 605 }, { "epoch": 1.07, "grad_norm": 0.1772097498178482, "learning_rate": 7.217885992317985e-05, "loss": 0.081, "step": 606 }, { "epoch": 1.07, "grad_norm": 0.4318295121192932, "learning_rate": 7.209527219560119e-05, "loss": 0.0532, "step": 607 }, { "epoch": 1.08, "grad_norm": 0.2505156695842743, "learning_rate": 7.201160766131207e-05, "loss": 0.0667, "step": 608 }, { "epoch": 1.08, "grad_norm": 0.13024090230464935, "learning_rate": 7.192786661114384e-05, "loss": 0.0234, "step": 609 }, { "epoch": 1.08, "grad_norm": 0.2824789583683014, "learning_rate": 7.184404933619377e-05, "loss": 0.095, "step": 610 }, { "epoch": 1.08, "grad_norm": 0.18059489130973816, "learning_rate": 7.17601561278242e-05, "loss": 0.0471, "step": 611 }, { "epoch": 1.08, "grad_norm": 0.2839769124984741, "learning_rate": 7.167618727766138e-05, "loss": 0.0783, "step": 612 }, { "epoch": 1.08, "grad_norm": 0.1342955082654953, "learning_rate": 7.159214307759448e-05, "loss": 0.0453, "step": 613 }, { "epoch": 1.09, "grad_norm": 0.1336507499217987, "learning_rate": 7.150802381977464e-05, "loss": 0.0431, "step": 614 }, { "epoch": 1.09, "grad_norm": 0.2953212559223175, "learning_rate": 7.142382979661386e-05, "loss": 0.0705, "step": 615 }, { "epoch": 1.09, "grad_norm": 0.17532870173454285, "learning_rate": 7.133956130078412e-05, "loss": 0.0666, "step": 616 }, { "epoch": 1.09, "grad_norm": 0.17404836416244507, "learning_rate": 7.12552186252162e-05, "loss": 0.0522, "step": 617 }, { "epoch": 1.09, "grad_norm": 0.25231000781059265, "learning_rate": 7.117080206309878e-05, "loss": 0.0854, "step": 618 }, { "epoch": 1.1, "grad_norm": 0.2264215499162674, "learning_rate": 7.108631190787735e-05, "loss": 0.0692, "step": 619 }, { "epoch": 1.1, "grad_norm": 0.3555202782154083, "learning_rate": 7.100174845325327e-05, "loss": 0.074, "step": 620 }, { "epoch": 1.1, "grad_norm": 0.34550729393959045, "learning_rate": 7.091711199318264e-05, "loss": 0.0831, "step": 621 }, { "epoch": 1.1, "grad_norm": 0.14560338854789734, "learning_rate": 7.083240282187543e-05, "loss": 0.0404, "step": 622 }, { "epoch": 1.1, "grad_norm": 0.23464788496494293, "learning_rate": 7.074762123379423e-05, "loss": 0.0699, "step": 623 }, { "epoch": 1.1, "grad_norm": 0.22587832808494568, "learning_rate": 7.066276752365352e-05, "loss": 0.0887, "step": 624 }, { "epoch": 1.11, "grad_norm": 0.17183855175971985, "learning_rate": 7.057784198641834e-05, "loss": 0.0373, "step": 625 }, { "epoch": 1.11, "grad_norm": 0.19148162007331848, "learning_rate": 7.049284491730354e-05, "loss": 0.0289, "step": 626 }, { "epoch": 1.11, "grad_norm": 0.26134082674980164, "learning_rate": 7.040777661177251e-05, "loss": 0.0367, "step": 627 }, { "epoch": 1.11, "grad_norm": 0.5379131436347961, "learning_rate": 7.032263736553635e-05, "loss": 0.1049, "step": 628 }, { "epoch": 1.11, "grad_norm": 0.13634662330150604, "learning_rate": 7.023742747455276e-05, "loss": 0.018, "step": 629 }, { "epoch": 1.11, "grad_norm": 0.28767991065979004, "learning_rate": 7.015214723502496e-05, "loss": 0.06, "step": 630 }, { "epoch": 1.12, "grad_norm": 0.2551933825016022, "learning_rate": 7.006679694340073e-05, "loss": 0.0407, "step": 631 }, { "epoch": 1.12, "grad_norm": 0.41325151920318604, "learning_rate": 6.998137689637142e-05, "loss": 0.046, "step": 632 }, { "epoch": 1.12, "grad_norm": 0.25655174255371094, "learning_rate": 6.989588739087078e-05, "loss": 0.0398, "step": 633 }, { "epoch": 1.12, "grad_norm": 0.3917771279811859, "learning_rate": 6.981032872407405e-05, "loss": 0.1072, "step": 634 }, { "epoch": 1.12, "grad_norm": 0.20738206803798676, "learning_rate": 6.972470119339691e-05, "loss": 0.0457, "step": 635 }, { "epoch": 1.13, "grad_norm": 0.1865154653787613, "learning_rate": 6.963900509649434e-05, "loss": 0.0258, "step": 636 }, { "epoch": 1.13, "grad_norm": 0.282071590423584, "learning_rate": 6.955324073125979e-05, "loss": 0.07, "step": 637 }, { "epoch": 1.13, "grad_norm": 0.27442115545272827, "learning_rate": 6.946740839582388e-05, "loss": 0.0875, "step": 638 }, { "epoch": 1.13, "grad_norm": 0.2635151445865631, "learning_rate": 6.938150838855359e-05, "loss": 0.0332, "step": 639 }, { "epoch": 1.13, "grad_norm": 0.16783182322978973, "learning_rate": 6.929554100805118e-05, "loss": 0.0405, "step": 640 }, { "epoch": 1.13, "grad_norm": 0.3328685760498047, "learning_rate": 6.920950655315297e-05, "loss": 0.1076, "step": 641 }, { "epoch": 1.14, "grad_norm": 0.20146729052066803, "learning_rate": 6.91234053229286e-05, "loss": 0.0481, "step": 642 }, { "epoch": 1.14, "grad_norm": 0.21599121391773224, "learning_rate": 6.903723761667973e-05, "loss": 0.0502, "step": 643 }, { "epoch": 1.14, "grad_norm": 0.16269706189632416, "learning_rate": 6.895100373393913e-05, "loss": 0.0652, "step": 644 }, { "epoch": 1.14, "grad_norm": 0.3716180622577667, "learning_rate": 6.886470397446958e-05, "loss": 0.0914, "step": 645 }, { "epoch": 1.14, "grad_norm": 0.18003414571285248, "learning_rate": 6.877833863826295e-05, "loss": 0.0484, "step": 646 }, { "epoch": 1.14, "grad_norm": 0.2745915949344635, "learning_rate": 6.869190802553894e-05, "loss": 0.1057, "step": 647 }, { "epoch": 1.15, "grad_norm": 0.2507147490978241, "learning_rate": 6.860541243674426e-05, "loss": 0.0587, "step": 648 }, { "epoch": 1.15, "grad_norm": 0.19874247908592224, "learning_rate": 6.851885217255145e-05, "loss": 0.0452, "step": 649 }, { "epoch": 1.15, "grad_norm": 0.21256215870380402, "learning_rate": 6.843222753385786e-05, "loss": 0.0434, "step": 650 }, { "epoch": 1.15, "grad_norm": 0.13569054007530212, "learning_rate": 6.834553882178463e-05, "loss": 0.0275, "step": 651 }, { "epoch": 1.15, "grad_norm": 0.26869267225265503, "learning_rate": 6.825878633767563e-05, "loss": 0.1006, "step": 652 }, { "epoch": 1.16, "grad_norm": 0.21900776028633118, "learning_rate": 6.817197038309644e-05, "loss": 0.0564, "step": 653 }, { "epoch": 1.16, "grad_norm": 0.13306765258312225, "learning_rate": 6.80850912598332e-05, "loss": 0.0304, "step": 654 }, { "epoch": 1.16, "grad_norm": 0.09863998740911484, "learning_rate": 6.79981492698917e-05, "loss": 0.0178, "step": 655 }, { "epoch": 1.16, "grad_norm": 0.2170545756816864, "learning_rate": 6.791114471549627e-05, "loss": 0.0754, "step": 656 }, { "epoch": 1.16, "grad_norm": 0.26661446690559387, "learning_rate": 6.782407789908863e-05, "loss": 0.1083, "step": 657 }, { "epoch": 1.16, "grad_norm": 0.2747049629688263, "learning_rate": 6.773694912332707e-05, "loss": 0.0758, "step": 658 }, { "epoch": 1.17, "grad_norm": 0.252560019493103, "learning_rate": 6.764975869108514e-05, "loss": 0.0681, "step": 659 }, { "epoch": 1.17, "grad_norm": 0.2867240905761719, "learning_rate": 6.756250690545079e-05, "loss": 0.095, "step": 660 }, { "epoch": 1.17, "grad_norm": 0.09760677814483643, "learning_rate": 6.747519406972524e-05, "loss": 0.0123, "step": 661 }, { "epoch": 1.17, "grad_norm": 0.17589041590690613, "learning_rate": 6.738782048742187e-05, "loss": 0.0437, "step": 662 }, { "epoch": 1.17, "grad_norm": 0.29266613721847534, "learning_rate": 6.730038646226532e-05, "loss": 0.0706, "step": 663 }, { "epoch": 1.17, "grad_norm": 0.1055804044008255, "learning_rate": 6.721289229819024e-05, "loss": 0.0343, "step": 664 }, { "epoch": 1.18, "grad_norm": 0.2909635305404663, "learning_rate": 6.712533829934042e-05, "loss": 0.0817, "step": 665 }, { "epoch": 1.18, "grad_norm": 0.2604895532131195, "learning_rate": 6.703772477006757e-05, "loss": 0.0452, "step": 666 }, { "epoch": 1.18, "grad_norm": 0.10520771890878677, "learning_rate": 6.695005201493038e-05, "loss": 0.0215, "step": 667 }, { "epoch": 1.18, "grad_norm": 0.10080817341804504, "learning_rate": 6.686232033869344e-05, "loss": 0.0188, "step": 668 }, { "epoch": 1.18, "grad_norm": 0.3340647220611572, "learning_rate": 6.677453004632608e-05, "loss": 0.0612, "step": 669 }, { "epoch": 1.19, "grad_norm": 0.29719796776771545, "learning_rate": 6.668668144300149e-05, "loss": 0.1014, "step": 670 }, { "epoch": 1.19, "grad_norm": 0.2131602168083191, "learning_rate": 6.659877483409545e-05, "loss": 0.0621, "step": 671 }, { "epoch": 1.19, "grad_norm": 0.1867963820695877, "learning_rate": 6.65108105251855e-05, "loss": 0.0312, "step": 672 }, { "epoch": 1.19, "grad_norm": 0.4250008463859558, "learning_rate": 6.642278882204963e-05, "loss": 0.0684, "step": 673 }, { "epoch": 1.19, "grad_norm": 0.20828047394752502, "learning_rate": 6.633471003066543e-05, "loss": 0.0421, "step": 674 }, { "epoch": 1.19, "grad_norm": 0.23356445133686066, "learning_rate": 6.62465744572089e-05, "loss": 0.0277, "step": 675 }, { "epoch": 1.2, "grad_norm": 0.42427390813827515, "learning_rate": 6.615838240805344e-05, "loss": 0.0745, "step": 676 }, { "epoch": 1.2, "grad_norm": 0.23298533260822296, "learning_rate": 6.607013418976874e-05, "loss": 0.047, "step": 677 }, { "epoch": 1.2, "grad_norm": 0.5681192278862, "learning_rate": 6.598183010911978e-05, "loss": 0.1032, "step": 678 }, { "epoch": 1.2, "grad_norm": 0.15370431542396545, "learning_rate": 6.589347047306571e-05, "loss": 0.0224, "step": 679 }, { "epoch": 1.2, "grad_norm": 0.2974132001399994, "learning_rate": 6.580505558875877e-05, "loss": 0.0908, "step": 680 }, { "epoch": 1.2, "grad_norm": 0.12158460170030594, "learning_rate": 6.571658576354333e-05, "loss": 0.0212, "step": 681 }, { "epoch": 1.21, "grad_norm": 0.32594335079193115, "learning_rate": 6.562806130495467e-05, "loss": 0.1016, "step": 682 }, { "epoch": 1.21, "grad_norm": 0.3316996097564697, "learning_rate": 6.5539482520718e-05, "loss": 0.0639, "step": 683 }, { "epoch": 1.21, "grad_norm": 0.21660655736923218, "learning_rate": 6.545084971874738e-05, "loss": 0.043, "step": 684 }, { "epoch": 1.21, "grad_norm": 0.350033164024353, "learning_rate": 6.536216320714466e-05, "loss": 0.0752, "step": 685 }, { "epoch": 1.21, "grad_norm": 0.30745336413383484, "learning_rate": 6.527342329419837e-05, "loss": 0.0927, "step": 686 }, { "epoch": 1.22, "grad_norm": 0.24984771013259888, "learning_rate": 6.51846302883827e-05, "loss": 0.0685, "step": 687 }, { "epoch": 1.22, "grad_norm": 0.07773179560899734, "learning_rate": 6.509578449835636e-05, "loss": 0.0152, "step": 688 }, { "epoch": 1.22, "grad_norm": 0.1620987057685852, "learning_rate": 6.500688623296159e-05, "loss": 0.0514, "step": 689 }, { "epoch": 1.22, "grad_norm": 0.1917831003665924, "learning_rate": 6.491793580122301e-05, "loss": 0.066, "step": 690 }, { "epoch": 1.22, "grad_norm": 0.21920029819011688, "learning_rate": 6.482893351234658e-05, "loss": 0.0547, "step": 691 }, { "epoch": 1.22, "grad_norm": 0.29076483845710754, "learning_rate": 6.473987967571856e-05, "loss": 0.079, "step": 692 }, { "epoch": 1.23, "grad_norm": 0.30292215943336487, "learning_rate": 6.46507746009043e-05, "loss": 0.0957, "step": 693 }, { "epoch": 1.23, "grad_norm": 0.14139439165592194, "learning_rate": 6.456161859764744e-05, "loss": 0.0346, "step": 694 }, { "epoch": 1.23, "grad_norm": 0.22850438952445984, "learning_rate": 6.447241197586847e-05, "loss": 0.0744, "step": 695 }, { "epoch": 1.23, "grad_norm": 0.48915836215019226, "learning_rate": 6.438315504566397e-05, "loss": 0.0953, "step": 696 }, { "epoch": 1.23, "grad_norm": 0.17644958198070526, "learning_rate": 6.429384811730528e-05, "loss": 0.046, "step": 697 }, { "epoch": 1.23, "grad_norm": 0.2039819210767746, "learning_rate": 6.420449150123767e-05, "loss": 0.1052, "step": 698 }, { "epoch": 1.24, "grad_norm": 0.17715586721897125, "learning_rate": 6.411508550807906e-05, "loss": 0.0447, "step": 699 }, { "epoch": 1.24, "grad_norm": 0.16100600361824036, "learning_rate": 6.4025630448619e-05, "loss": 0.0344, "step": 700 }, { "epoch": 1.24, "grad_norm": 0.22480256855487823, "learning_rate": 6.393612663381763e-05, "loss": 0.0495, "step": 701 }, { "epoch": 1.24, "grad_norm": 0.12992677092552185, "learning_rate": 6.384657437480458e-05, "loss": 0.0409, "step": 702 }, { "epoch": 1.24, "grad_norm": 0.1325366348028183, "learning_rate": 6.375697398287787e-05, "loss": 0.0257, "step": 703 }, { "epoch": 1.25, "grad_norm": 0.16241514682769775, "learning_rate": 6.366732576950284e-05, "loss": 0.0427, "step": 704 }, { "epoch": 1.25, "grad_norm": 0.21476183831691742, "learning_rate": 6.357763004631104e-05, "loss": 0.0451, "step": 705 }, { "epoch": 1.25, "grad_norm": 0.32039332389831543, "learning_rate": 6.34878871250992e-05, "loss": 0.0545, "step": 706 }, { "epoch": 1.25, "grad_norm": 0.3203076124191284, "learning_rate": 6.33980973178281e-05, "loss": 0.0917, "step": 707 }, { "epoch": 1.25, "grad_norm": 0.25006967782974243, "learning_rate": 6.330826093662156e-05, "loss": 0.1028, "step": 708 }, { "epoch": 1.25, "grad_norm": 0.52630215883255, "learning_rate": 6.32183782937652e-05, "loss": 0.0889, "step": 709 }, { "epoch": 1.26, "grad_norm": 0.33741331100463867, "learning_rate": 6.31284497017055e-05, "loss": 0.0725, "step": 710 }, { "epoch": 1.26, "eval_loss": 0.07285241782665253, "eval_runtime": 14.6756, "eval_samples_per_second": 32.503, "eval_steps_per_second": 8.177, "step": 710 }, { "epoch": 1.26, "grad_norm": 0.40746867656707764, "learning_rate": 6.303847547304873e-05, "loss": 0.0945, "step": 711 }, { "epoch": 1.26, "grad_norm": 0.22757941484451294, "learning_rate": 6.294845592055967e-05, "loss": 0.0532, "step": 712 }, { "epoch": 1.26, "grad_norm": 0.19006334245204926, "learning_rate": 6.285839135716079e-05, "loss": 0.0484, "step": 713 }, { "epoch": 1.26, "grad_norm": 0.48126357793807983, "learning_rate": 6.27682820959309e-05, "loss": 0.0967, "step": 714 }, { "epoch": 1.26, "grad_norm": 0.23766569793224335, "learning_rate": 6.26781284501043e-05, "loss": 0.106, "step": 715 }, { "epoch": 1.27, "grad_norm": 0.16818860173225403, "learning_rate": 6.258793073306949e-05, "loss": 0.0494, "step": 716 }, { "epoch": 1.27, "grad_norm": 0.28579115867614746, "learning_rate": 6.249768925836822e-05, "loss": 0.0937, "step": 717 }, { "epoch": 1.27, "grad_norm": 0.16623319685459137, "learning_rate": 6.240740433969432e-05, "loss": 0.0301, "step": 718 }, { "epoch": 1.27, "grad_norm": 0.1560198813676834, "learning_rate": 6.231707629089262e-05, "loss": 0.0384, "step": 719 }, { "epoch": 1.27, "grad_norm": 0.2002251148223877, "learning_rate": 6.2226705425958e-05, "loss": 0.0545, "step": 720 }, { "epoch": 1.28, "grad_norm": 0.16213096678256989, "learning_rate": 6.2136292059034e-05, "loss": 0.0433, "step": 721 }, { "epoch": 1.28, "grad_norm": 0.27064821124076843, "learning_rate": 6.204583650441201e-05, "loss": 0.0796, "step": 722 }, { "epoch": 1.28, "grad_norm": 0.11131159216165543, "learning_rate": 6.195533907653004e-05, "loss": 0.0229, "step": 723 }, { "epoch": 1.28, "grad_norm": 0.22354401648044586, "learning_rate": 6.18648000899717e-05, "loss": 0.0475, "step": 724 }, { "epoch": 1.28, "grad_norm": 0.19944117963314056, "learning_rate": 6.177421985946499e-05, "loss": 0.0413, "step": 725 }, { "epoch": 1.28, "grad_norm": 0.32458746433258057, "learning_rate": 6.168359869988134e-05, "loss": 0.1205, "step": 726 }, { "epoch": 1.29, "grad_norm": 0.19088833034038544, "learning_rate": 6.159293692623443e-05, "loss": 0.0626, "step": 727 }, { "epoch": 1.29, "grad_norm": 0.2114744633436203, "learning_rate": 6.150223485367914e-05, "loss": 0.048, "step": 728 }, { "epoch": 1.29, "grad_norm": 0.11308068782091141, "learning_rate": 6.141149279751043e-05, "loss": 0.0286, "step": 729 }, { "epoch": 1.29, "grad_norm": 0.22453975677490234, "learning_rate": 6.13207110731622e-05, "loss": 0.0279, "step": 730 }, { "epoch": 1.29, "grad_norm": 0.274513840675354, "learning_rate": 6.122988999620634e-05, "loss": 0.0553, "step": 731 }, { "epoch": 1.3, "grad_norm": 0.2700372636318207, "learning_rate": 6.113902988235145e-05, "loss": 0.0973, "step": 732 }, { "epoch": 1.3, "grad_norm": 0.3287579119205475, "learning_rate": 6.104813104744188e-05, "loss": 0.0853, "step": 733 }, { "epoch": 1.3, "grad_norm": 0.37582048773765564, "learning_rate": 6.095719380745654e-05, "loss": 0.088, "step": 734 }, { "epoch": 1.3, "grad_norm": 0.2009502649307251, "learning_rate": 6.086621847850788e-05, "loss": 0.0525, "step": 735 }, { "epoch": 1.3, "grad_norm": 0.1417909413576126, "learning_rate": 6.077520537684072e-05, "loss": 0.0311, "step": 736 }, { "epoch": 1.3, "grad_norm": 0.18328174948692322, "learning_rate": 6.068415481883122e-05, "loss": 0.0379, "step": 737 }, { "epoch": 1.31, "grad_norm": 0.1513252556324005, "learning_rate": 6.059306712098571e-05, "loss": 0.0319, "step": 738 }, { "epoch": 1.31, "grad_norm": 0.2544059753417969, "learning_rate": 6.0501942599939666e-05, "loss": 0.0593, "step": 739 }, { "epoch": 1.31, "grad_norm": 0.4964008331298828, "learning_rate": 6.0410781572456486e-05, "loss": 0.0367, "step": 740 }, { "epoch": 1.31, "grad_norm": 0.14776591956615448, "learning_rate": 6.031958435542659e-05, "loss": 0.0284, "step": 741 }, { "epoch": 1.31, "grad_norm": 0.20590472221374512, "learning_rate": 6.022835126586609e-05, "loss": 0.0359, "step": 742 }, { "epoch": 1.31, "grad_norm": 0.2493211179971695, "learning_rate": 6.0137082620915863e-05, "loss": 0.0424, "step": 743 }, { "epoch": 1.32, "grad_norm": 0.31557443737983704, "learning_rate": 6.0045778737840344e-05, "loss": 0.0563, "step": 744 }, { "epoch": 1.32, "grad_norm": 0.14257828891277313, "learning_rate": 5.995443993402647e-05, "loss": 0.024, "step": 745 }, { "epoch": 1.32, "grad_norm": 0.21385452151298523, "learning_rate": 5.9863066526982605e-05, "loss": 0.0721, "step": 746 }, { "epoch": 1.32, "grad_norm": 0.17539048194885254, "learning_rate": 5.977165883433734e-05, "loss": 0.025, "step": 747 }, { "epoch": 1.32, "grad_norm": 0.28508231043815613, "learning_rate": 5.9680217173838494e-05, "loss": 0.0595, "step": 748 }, { "epoch": 1.33, "grad_norm": 0.30929744243621826, "learning_rate": 5.9588741863351924e-05, "loss": 0.112, "step": 749 }, { "epoch": 1.33, "grad_norm": 0.439656525850296, "learning_rate": 5.949723322086053e-05, "loss": 0.0427, "step": 750 }, { "epoch": 1.33, "grad_norm": 0.2300054430961609, "learning_rate": 5.940569156446298e-05, "loss": 0.0437, "step": 751 }, { "epoch": 1.33, "grad_norm": 0.4155109226703644, "learning_rate": 5.931411721237279e-05, "loss": 0.0569, "step": 752 }, { "epoch": 1.33, "grad_norm": 0.25196224451065063, "learning_rate": 5.922251048291707e-05, "loss": 0.0413, "step": 753 }, { "epoch": 1.33, "grad_norm": 0.5078486204147339, "learning_rate": 5.913087169453554e-05, "loss": 0.0988, "step": 754 }, { "epoch": 1.34, "grad_norm": 0.26931652426719666, "learning_rate": 5.9039201165779315e-05, "loss": 0.0578, "step": 755 }, { "epoch": 1.34, "grad_norm": 0.2641213834285736, "learning_rate": 5.8947499215309834e-05, "loss": 0.0362, "step": 756 }, { "epoch": 1.34, "grad_norm": 0.23865339159965515, "learning_rate": 5.8855766161897805e-05, "loss": 0.0375, "step": 757 }, { "epoch": 1.34, "grad_norm": 0.2594137191772461, "learning_rate": 5.876400232442205e-05, "loss": 0.0489, "step": 758 }, { "epoch": 1.34, "grad_norm": 0.2721590995788574, "learning_rate": 5.867220802186837e-05, "loss": 0.0407, "step": 759 }, { "epoch": 1.34, "grad_norm": 0.3681499660015106, "learning_rate": 5.85803835733285e-05, "loss": 0.0554, "step": 760 }, { "epoch": 1.35, "grad_norm": 0.3132595121860504, "learning_rate": 5.848852929799894e-05, "loss": 0.0486, "step": 761 }, { "epoch": 1.35, "grad_norm": 0.16972127556800842, "learning_rate": 5.8396645515179884e-05, "loss": 0.0473, "step": 762 }, { "epoch": 1.35, "grad_norm": 0.30628886818885803, "learning_rate": 5.83047325442741e-05, "loss": 0.0664, "step": 763 }, { "epoch": 1.35, "grad_norm": 0.3327179551124573, "learning_rate": 5.8212790704785824e-05, "loss": 0.0605, "step": 764 }, { "epoch": 1.35, "grad_norm": 0.3301398754119873, "learning_rate": 5.812082031631966e-05, "loss": 0.0477, "step": 765 }, { "epoch": 1.36, "grad_norm": 0.23960134387016296, "learning_rate": 5.8028821698579385e-05, "loss": 0.0376, "step": 766 }, { "epoch": 1.36, "grad_norm": 0.2526357173919678, "learning_rate": 5.7936795171367e-05, "loss": 0.0712, "step": 767 }, { "epoch": 1.36, "grad_norm": 0.32746273279190063, "learning_rate": 5.784474105458143e-05, "loss": 0.0542, "step": 768 }, { "epoch": 1.36, "grad_norm": 0.10859230905771255, "learning_rate": 5.77526596682176e-05, "loss": 0.019, "step": 769 }, { "epoch": 1.36, "grad_norm": 0.2908915877342224, "learning_rate": 5.766055133236513e-05, "loss": 0.142, "step": 770 }, { "epoch": 1.36, "grad_norm": 0.26869770884513855, "learning_rate": 5.7568416367207404e-05, "loss": 0.0774, "step": 771 }, { "epoch": 1.37, "grad_norm": 0.39681994915008545, "learning_rate": 5.7476255093020326e-05, "loss": 0.0632, "step": 772 }, { "epoch": 1.37, "grad_norm": 0.14335761964321136, "learning_rate": 5.7384067830171274e-05, "loss": 0.03, "step": 773 }, { "epoch": 1.37, "grad_norm": 0.1377771943807602, "learning_rate": 5.729185489911797e-05, "loss": 0.0263, "step": 774 }, { "epoch": 1.37, "grad_norm": 0.19834232330322266, "learning_rate": 5.719961662040733e-05, "loss": 0.0506, "step": 775 }, { "epoch": 1.37, "grad_norm": 0.14378659427165985, "learning_rate": 5.710735331467444e-05, "loss": 0.0285, "step": 776 }, { "epoch": 1.37, "grad_norm": 0.25368401408195496, "learning_rate": 5.701506530264132e-05, "loss": 0.0584, "step": 777 }, { "epoch": 1.38, "grad_norm": 0.12339203804731369, "learning_rate": 5.692275290511592e-05, "loss": 0.0282, "step": 778 }, { "epoch": 1.38, "grad_norm": 0.203715518116951, "learning_rate": 5.683041644299093e-05, "loss": 0.0849, "step": 779 }, { "epoch": 1.38, "grad_norm": 0.1526814103126526, "learning_rate": 5.673805623724272e-05, "loss": 0.0256, "step": 780 }, { "epoch": 1.38, "grad_norm": 0.18840323388576508, "learning_rate": 5.664567260893019e-05, "loss": 0.048, "step": 781 }, { "epoch": 1.38, "grad_norm": 0.15979206562042236, "learning_rate": 5.6553265879193606e-05, "loss": 0.0237, "step": 782 }, { "epoch": 1.39, "grad_norm": 0.1128401905298233, "learning_rate": 5.6460836369253624e-05, "loss": 0.0213, "step": 783 }, { "epoch": 1.39, "grad_norm": 0.1648949831724167, "learning_rate": 5.6368384400410035e-05, "loss": 0.0348, "step": 784 }, { "epoch": 1.39, "grad_norm": 0.31091129779815674, "learning_rate": 5.627591029404071e-05, "loss": 0.0685, "step": 785 }, { "epoch": 1.39, "grad_norm": 0.2921251654624939, "learning_rate": 5.6183414371600496e-05, "loss": 0.045, "step": 786 }, { "epoch": 1.39, "grad_norm": 0.3398689925670624, "learning_rate": 5.609089695462002e-05, "loss": 0.0546, "step": 787 }, { "epoch": 1.39, "grad_norm": 0.21610289812088013, "learning_rate": 5.599835836470469e-05, "loss": 0.0322, "step": 788 }, { "epoch": 1.4, "grad_norm": 0.3218781054019928, "learning_rate": 5.5905798923533484e-05, "loss": 0.0331, "step": 789 }, { "epoch": 1.4, "grad_norm": 0.5338783860206604, "learning_rate": 5.581321895285787e-05, "loss": 0.0764, "step": 790 }, { "epoch": 1.4, "grad_norm": 0.2539553940296173, "learning_rate": 5.5720618774500675e-05, "loss": 0.0553, "step": 791 }, { "epoch": 1.4, "grad_norm": 0.22199298441410065, "learning_rate": 5.5627998710354957e-05, "loss": 0.0304, "step": 792 }, { "epoch": 1.4, "grad_norm": 0.13213643431663513, "learning_rate": 5.5535359082382944e-05, "loss": 0.0115, "step": 793 }, { "epoch": 1.4, "grad_norm": 0.3686007857322693, "learning_rate": 5.544270021261483e-05, "loss": 0.0371, "step": 794 }, { "epoch": 1.41, "grad_norm": 0.08815140277147293, "learning_rate": 5.535002242314772e-05, "loss": 0.0089, "step": 795 }, { "epoch": 1.41, "grad_norm": 0.40321916341781616, "learning_rate": 5.525732603614444e-05, "loss": 0.0653, "step": 796 }, { "epoch": 1.41, "grad_norm": 0.44097116589546204, "learning_rate": 5.5164611373832544e-05, "loss": 0.0555, "step": 797 }, { "epoch": 1.41, "grad_norm": 0.5565125942230225, "learning_rate": 5.5071878758503046e-05, "loss": 0.0646, "step": 798 }, { "epoch": 1.41, "grad_norm": 0.8579866290092468, "learning_rate": 5.49791285125094e-05, "loss": 0.1532, "step": 799 }, { "epoch": 1.42, "grad_norm": 0.550639271736145, "learning_rate": 5.488636095826636e-05, "loss": 0.0574, "step": 800 }, { "epoch": 1.42, "grad_norm": 0.07725897431373596, "learning_rate": 5.479357641824877e-05, "loss": 0.0087, "step": 801 }, { "epoch": 1.42, "grad_norm": 0.25981655716896057, "learning_rate": 5.470077521499063e-05, "loss": 0.0328, "step": 802 }, { "epoch": 1.42, "grad_norm": 0.07465404272079468, "learning_rate": 5.4607957671083786e-05, "loss": 0.0117, "step": 803 }, { "epoch": 1.42, "grad_norm": 0.22613628208637238, "learning_rate": 5.4515124109176904e-05, "loss": 0.0596, "step": 804 }, { "epoch": 1.42, "grad_norm": 0.20493067800998688, "learning_rate": 5.442227485197435e-05, "loss": 0.0394, "step": 805 }, { "epoch": 1.43, "grad_norm": 0.2182394117116928, "learning_rate": 5.4329410222235034e-05, "loss": 0.0491, "step": 806 }, { "epoch": 1.43, "grad_norm": 0.16270771622657776, "learning_rate": 5.42365305427713e-05, "loss": 0.0333, "step": 807 }, { "epoch": 1.43, "grad_norm": 0.3527982234954834, "learning_rate": 5.414363613644782e-05, "loss": 0.1369, "step": 808 }, { "epoch": 1.43, "grad_norm": 0.38832610845565796, "learning_rate": 5.405072732618043e-05, "loss": 0.0719, "step": 809 }, { "epoch": 1.43, "grad_norm": 0.18581318855285645, "learning_rate": 5.395780443493508e-05, "loss": 0.0305, "step": 810 }, { "epoch": 1.43, "grad_norm": 0.298115611076355, "learning_rate": 5.386486778572665e-05, "loss": 0.0676, "step": 811 }, { "epoch": 1.44, "grad_norm": 0.15877433121204376, "learning_rate": 5.3771917701617827e-05, "loss": 0.0343, "step": 812 }, { "epoch": 1.44, "grad_norm": 0.14393776655197144, "learning_rate": 5.367895450571801e-05, "loss": 0.0395, "step": 813 }, { "epoch": 1.44, "grad_norm": 0.25177934765815735, "learning_rate": 5.358597852118219e-05, "loss": 0.0757, "step": 814 }, { "epoch": 1.44, "grad_norm": 0.16125288605690002, "learning_rate": 5.3492990071209806e-05, "loss": 0.0432, "step": 815 }, { "epoch": 1.44, "grad_norm": 0.21766537427902222, "learning_rate": 5.3399989479043624e-05, "loss": 0.087, "step": 816 }, { "epoch": 1.45, "grad_norm": 0.18221743404865265, "learning_rate": 5.3306977067968614e-05, "loss": 0.0403, "step": 817 }, { "epoch": 1.45, "grad_norm": 0.2517869472503662, "learning_rate": 5.3213953161310825e-05, "loss": 0.0666, "step": 818 }, { "epoch": 1.45, "grad_norm": 0.20983122289180756, "learning_rate": 5.3120918082436314e-05, "loss": 0.0664, "step": 819 }, { "epoch": 1.45, "grad_norm": 0.17914025485515594, "learning_rate": 5.3027872154749915e-05, "loss": 0.0382, "step": 820 }, { "epoch": 1.45, "grad_norm": 0.12290598452091217, "learning_rate": 5.2934815701694204e-05, "loss": 0.0282, "step": 821 }, { "epoch": 1.45, "grad_norm": 0.17699532210826874, "learning_rate": 5.2841749046748345e-05, "loss": 0.0413, "step": 822 }, { "epoch": 1.46, "grad_norm": 0.3889511227607727, "learning_rate": 5.274867251342694e-05, "loss": 0.0758, "step": 823 }, { "epoch": 1.46, "grad_norm": 0.24286973476409912, "learning_rate": 5.2655586425278966e-05, "loss": 0.0532, "step": 824 }, { "epoch": 1.46, "grad_norm": 0.14911137521266937, "learning_rate": 5.256249110588659e-05, "loss": 0.0277, "step": 825 }, { "epoch": 1.46, "grad_norm": 0.31271466612815857, "learning_rate": 5.246938687886409e-05, "loss": 0.0726, "step": 826 }, { "epoch": 1.46, "grad_norm": 0.2684333920478821, "learning_rate": 5.237627406785667e-05, "loss": 0.0993, "step": 827 }, { "epoch": 1.46, "grad_norm": 0.14797139167785645, "learning_rate": 5.228315299653942e-05, "loss": 0.0198, "step": 828 }, { "epoch": 1.47, "grad_norm": 0.29548555612564087, "learning_rate": 5.2190023988616113e-05, "loss": 0.0562, "step": 829 }, { "epoch": 1.47, "grad_norm": 0.29321712255477905, "learning_rate": 5.2096887367818105e-05, "loss": 0.1208, "step": 830 }, { "epoch": 1.47, "grad_norm": 0.18331380188465118, "learning_rate": 5.2003743457903256e-05, "loss": 0.0256, "step": 831 }, { "epoch": 1.47, "grad_norm": 0.21740898489952087, "learning_rate": 5.1910592582654715e-05, "loss": 0.057, "step": 832 }, { "epoch": 1.47, "grad_norm": 0.2625051736831665, "learning_rate": 5.181743506587989e-05, "loss": 0.0667, "step": 833 }, { "epoch": 1.48, "grad_norm": 0.2670525908470154, "learning_rate": 5.172427123140923e-05, "loss": 0.0883, "step": 834 }, { "epoch": 1.48, "grad_norm": 0.33282265067100525, "learning_rate": 5.1631101403095184e-05, "loss": 0.0424, "step": 835 }, { "epoch": 1.48, "grad_norm": 0.21608753502368927, "learning_rate": 5.1537925904811004e-05, "loss": 0.049, "step": 836 }, { "epoch": 1.48, "grad_norm": 0.10450909286737442, "learning_rate": 5.144474506044968e-05, "loss": 0.0158, "step": 837 }, { "epoch": 1.48, "grad_norm": 0.3188491761684418, "learning_rate": 5.135155919392279e-05, "loss": 0.0547, "step": 838 }, { "epoch": 1.48, "grad_norm": 0.24398969113826752, "learning_rate": 5.125836862915934e-05, "loss": 0.053, "step": 839 }, { "epoch": 1.49, "grad_norm": 0.1743936687707901, "learning_rate": 5.116517369010466e-05, "loss": 0.0239, "step": 840 }, { "epoch": 1.49, "grad_norm": 0.180791437625885, "learning_rate": 5.1071974700719326e-05, "loss": 0.0864, "step": 841 }, { "epoch": 1.49, "grad_norm": 0.19678902626037598, "learning_rate": 5.0978771984978003e-05, "loss": 0.0376, "step": 842 }, { "epoch": 1.49, "grad_norm": 0.230797678232193, "learning_rate": 5.0885565866868227e-05, "loss": 0.0597, "step": 843 }, { "epoch": 1.49, "grad_norm": 0.4890972971916199, "learning_rate": 5.079235667038944e-05, "loss": 0.0832, "step": 844 }, { "epoch": 1.49, "grad_norm": 0.20508797466754913, "learning_rate": 5.069914471955178e-05, "loss": 0.0349, "step": 845 }, { "epoch": 1.5, "grad_norm": 0.21593628823757172, "learning_rate": 5.060593033837493e-05, "loss": 0.0354, "step": 846 }, { "epoch": 1.5, "grad_norm": 0.2712628245353699, "learning_rate": 5.051271385088702e-05, "loss": 0.0311, "step": 847 }, { "epoch": 1.5, "grad_norm": 0.11844774335622787, "learning_rate": 5.041949558112351e-05, "loss": 0.0109, "step": 848 }, { "epoch": 1.5, "grad_norm": 0.1798882633447647, "learning_rate": 5.032627585312608e-05, "loss": 0.0196, "step": 849 }, { "epoch": 1.5, "grad_norm": 0.290019690990448, "learning_rate": 5.023305499094144e-05, "loss": 0.0667, "step": 850 }, { "epoch": 1.51, "grad_norm": 0.24924272298812866, "learning_rate": 5.013983331862027e-05, "loss": 0.0556, "step": 851 }, { "epoch": 1.51, "grad_norm": 0.22597135603427887, "learning_rate": 5.004661116021605e-05, "loss": 0.0495, "step": 852 }, { "epoch": 1.51, "eval_loss": 0.07545028626918793, "eval_runtime": 14.6561, "eval_samples_per_second": 32.546, "eval_steps_per_second": 8.188, "step": 852 }, { "epoch": 1.51, "grad_norm": 0.15348747372627258, "learning_rate": 4.9953388839783954e-05, "loss": 0.0204, "step": 853 }, { "epoch": 1.51, "grad_norm": 0.6507572531700134, "learning_rate": 4.9860166681379745e-05, "loss": 0.076, "step": 854 }, { "epoch": 1.51, "grad_norm": 0.13102935254573822, "learning_rate": 4.976694500905857e-05, "loss": 0.0143, "step": 855 }, { "epoch": 1.51, "grad_norm": 0.43004414439201355, "learning_rate": 4.967372414687393e-05, "loss": 0.0675, "step": 856 }, { "epoch": 1.52, "grad_norm": 0.38339918851852417, "learning_rate": 4.95805044188765e-05, "loss": 0.0747, "step": 857 }, { "epoch": 1.52, "grad_norm": 0.4646240472793579, "learning_rate": 4.9487286149112986e-05, "loss": 0.0883, "step": 858 }, { "epoch": 1.52, "grad_norm": 0.2721651792526245, "learning_rate": 4.9394069661625076e-05, "loss": 0.062, "step": 859 }, { "epoch": 1.52, "grad_norm": 0.356275349855423, "learning_rate": 4.930085528044823e-05, "loss": 0.0321, "step": 860 }, { "epoch": 1.52, "grad_norm": 0.3066048324108124, "learning_rate": 4.9207643329610556e-05, "loss": 0.0525, "step": 861 }, { "epoch": 1.52, "grad_norm": 0.12254035472869873, "learning_rate": 4.911443413313179e-05, "loss": 0.0106, "step": 862 }, { "epoch": 1.53, "grad_norm": 0.19596797227859497, "learning_rate": 4.9021228015022015e-05, "loss": 0.0242, "step": 863 }, { "epoch": 1.53, "grad_norm": 0.39066678285598755, "learning_rate": 4.892802529928067e-05, "loss": 0.0558, "step": 864 }, { "epoch": 1.53, "grad_norm": 0.17401085793972015, "learning_rate": 4.883482630989535e-05, "loss": 0.0203, "step": 865 }, { "epoch": 1.53, "grad_norm": 0.28903472423553467, "learning_rate": 4.874163137084068e-05, "loss": 0.0819, "step": 866 }, { "epoch": 1.53, "grad_norm": 0.2746363580226898, "learning_rate": 4.8648440806077226e-05, "loss": 0.0696, "step": 867 }, { "epoch": 1.54, "grad_norm": 0.40534642338752747, "learning_rate": 4.8555254939550324e-05, "loss": 0.111, "step": 868 }, { "epoch": 1.54, "grad_norm": 0.41272208094596863, "learning_rate": 4.8462074095188994e-05, "loss": 0.1089, "step": 869 }, { "epoch": 1.54, "grad_norm": 0.40718454122543335, "learning_rate": 4.8368898596904834e-05, "loss": 0.1339, "step": 870 }, { "epoch": 1.54, "grad_norm": 0.14493143558502197, "learning_rate": 4.827572876859078e-05, "loss": 0.0227, "step": 871 }, { "epoch": 1.54, "grad_norm": 0.20390640199184418, "learning_rate": 4.8182564934120115e-05, "loss": 0.0464, "step": 872 }, { "epoch": 1.54, "grad_norm": 0.12927311658859253, "learning_rate": 4.80894074173453e-05, "loss": 0.0253, "step": 873 }, { "epoch": 1.55, "grad_norm": 0.17653903365135193, "learning_rate": 4.799625654209675e-05, "loss": 0.0509, "step": 874 }, { "epoch": 1.55, "grad_norm": 0.34687289595603943, "learning_rate": 4.790311263218191e-05, "loss": 0.0916, "step": 875 }, { "epoch": 1.55, "grad_norm": 0.22851605713367462, "learning_rate": 4.7809976011383905e-05, "loss": 0.0857, "step": 876 }, { "epoch": 1.55, "grad_norm": 0.27485382556915283, "learning_rate": 4.771684700346059e-05, "loss": 0.0623, "step": 877 }, { "epoch": 1.55, "grad_norm": 0.21887461841106415, "learning_rate": 4.762372593214335e-05, "loss": 0.0573, "step": 878 }, { "epoch": 1.56, "grad_norm": 0.2466115951538086, "learning_rate": 4.753061312113592e-05, "loss": 0.1039, "step": 879 }, { "epoch": 1.56, "grad_norm": 0.344625860452652, "learning_rate": 4.743750889411342e-05, "loss": 0.0637, "step": 880 }, { "epoch": 1.56, "grad_norm": 0.1676146686077118, "learning_rate": 4.7344413574721046e-05, "loss": 0.0372, "step": 881 }, { "epoch": 1.56, "grad_norm": 0.14225785434246063, "learning_rate": 4.725132748657307e-05, "loss": 0.0506, "step": 882 }, { "epoch": 1.56, "grad_norm": 0.19915729761123657, "learning_rate": 4.715825095325168e-05, "loss": 0.0459, "step": 883 }, { "epoch": 1.56, "grad_norm": 0.20955249667167664, "learning_rate": 4.70651842983058e-05, "loss": 0.0539, "step": 884 }, { "epoch": 1.57, "grad_norm": 0.171535924077034, "learning_rate": 4.697212784525008e-05, "loss": 0.0346, "step": 885 }, { "epoch": 1.57, "grad_norm": 0.11981090158224106, "learning_rate": 4.687908191756369e-05, "loss": 0.0378, "step": 886 }, { "epoch": 1.57, "grad_norm": 0.18210795521736145, "learning_rate": 4.678604683868918e-05, "loss": 0.0563, "step": 887 }, { "epoch": 1.57, "grad_norm": 0.18884742259979248, "learning_rate": 4.669302293203142e-05, "loss": 0.0393, "step": 888 }, { "epoch": 1.57, "grad_norm": 0.21338443458080292, "learning_rate": 4.660001052095639e-05, "loss": 0.054, "step": 889 }, { "epoch": 1.57, "grad_norm": 0.16022799909114838, "learning_rate": 4.65070099287902e-05, "loss": 0.0497, "step": 890 }, { "epoch": 1.58, "grad_norm": 0.30642077326774597, "learning_rate": 4.641402147881782e-05, "loss": 0.0702, "step": 891 }, { "epoch": 1.58, "grad_norm": 0.24659690260887146, "learning_rate": 4.6321045494282e-05, "loss": 0.0986, "step": 892 }, { "epoch": 1.58, "grad_norm": 0.4151371419429779, "learning_rate": 4.62280822983822e-05, "loss": 0.1064, "step": 893 }, { "epoch": 1.58, "grad_norm": 0.19555744528770447, "learning_rate": 4.613513221427337e-05, "loss": 0.034, "step": 894 }, { "epoch": 1.58, "grad_norm": 0.3575385510921478, "learning_rate": 4.604219556506492e-05, "loss": 0.0563, "step": 895 }, { "epoch": 1.59, "grad_norm": 0.33982524275779724, "learning_rate": 4.594927267381958e-05, "loss": 0.1152, "step": 896 }, { "epoch": 1.59, "grad_norm": 0.40054503083229065, "learning_rate": 4.58563638635522e-05, "loss": 0.0684, "step": 897 }, { "epoch": 1.59, "grad_norm": 0.16741478443145752, "learning_rate": 4.5763469457228695e-05, "loss": 0.0221, "step": 898 }, { "epoch": 1.59, "grad_norm": 0.30603042244911194, "learning_rate": 4.5670589777764984e-05, "loss": 0.0725, "step": 899 }, { "epoch": 1.59, "grad_norm": 0.345217227935791, "learning_rate": 4.5577725148025646e-05, "loss": 0.062, "step": 900 }, { "epoch": 1.59, "grad_norm": 0.4248473048210144, "learning_rate": 4.54848758908231e-05, "loss": 0.1482, "step": 901 }, { "epoch": 1.6, "grad_norm": 0.25596097111701965, "learning_rate": 4.5392042328916226e-05, "loss": 0.0417, "step": 902 }, { "epoch": 1.6, "grad_norm": 0.14021873474121094, "learning_rate": 4.5299224785009374e-05, "loss": 0.0242, "step": 903 }, { "epoch": 1.6, "grad_norm": 0.16665437817573547, "learning_rate": 4.5206423581751245e-05, "loss": 0.0569, "step": 904 }, { "epoch": 1.6, "grad_norm": 0.29362550377845764, "learning_rate": 4.511363904173366e-05, "loss": 0.068, "step": 905 }, { "epoch": 1.6, "grad_norm": 0.44577184319496155, "learning_rate": 4.5020871487490604e-05, "loss": 0.0787, "step": 906 }, { "epoch": 1.6, "grad_norm": 0.20594125986099243, "learning_rate": 4.492812124149696e-05, "loss": 0.0868, "step": 907 }, { "epoch": 1.61, "grad_norm": 0.2906559109687805, "learning_rate": 4.483538862616747e-05, "loss": 0.0592, "step": 908 }, { "epoch": 1.61, "grad_norm": 0.17545486986637115, "learning_rate": 4.4742673963855576e-05, "loss": 0.0225, "step": 909 }, { "epoch": 1.61, "grad_norm": 0.18305286765098572, "learning_rate": 4.46499775768523e-05, "loss": 0.0483, "step": 910 }, { "epoch": 1.61, "grad_norm": 0.2249644249677658, "learning_rate": 4.455729978738517e-05, "loss": 0.0383, "step": 911 }, { "epoch": 1.61, "grad_norm": 0.3094448149204254, "learning_rate": 4.446464091761706e-05, "loss": 0.0533, "step": 912 }, { "epoch": 1.62, "grad_norm": 0.22453933954238892, "learning_rate": 4.437200128964504e-05, "loss": 0.0435, "step": 913 }, { "epoch": 1.62, "grad_norm": 0.1814616322517395, "learning_rate": 4.4279381225499344e-05, "loss": 0.0245, "step": 914 }, { "epoch": 1.62, "grad_norm": 0.20599542558193207, "learning_rate": 4.418678104714214e-05, "loss": 0.0321, "step": 915 }, { "epoch": 1.62, "grad_norm": 0.27197298407554626, "learning_rate": 4.409420107646652e-05, "loss": 0.0512, "step": 916 }, { "epoch": 1.62, "grad_norm": 0.35009968280792236, "learning_rate": 4.400164163529532e-05, "loss": 0.0717, "step": 917 }, { "epoch": 1.62, "grad_norm": 0.17196977138519287, "learning_rate": 4.390910304537999e-05, "loss": 0.033, "step": 918 }, { "epoch": 1.63, "grad_norm": 0.1884760707616806, "learning_rate": 4.381658562839953e-05, "loss": 0.0526, "step": 919 }, { "epoch": 1.63, "grad_norm": 0.4165942966938019, "learning_rate": 4.3724089705959305e-05, "loss": 0.0824, "step": 920 }, { "epoch": 1.63, "grad_norm": 0.36213231086730957, "learning_rate": 4.363161559958996e-05, "loss": 0.0524, "step": 921 }, { "epoch": 1.63, "grad_norm": 0.22675907611846924, "learning_rate": 4.353916363074638e-05, "loss": 0.0367, "step": 922 }, { "epoch": 1.63, "grad_norm": 0.29561713337898254, "learning_rate": 4.34467341208064e-05, "loss": 0.0364, "step": 923 }, { "epoch": 1.63, "grad_norm": 0.22054970264434814, "learning_rate": 4.3354327391069826e-05, "loss": 0.025, "step": 924 }, { "epoch": 1.64, "grad_norm": 0.06693907827138901, "learning_rate": 4.3261943762757287e-05, "loss": 0.0104, "step": 925 }, { "epoch": 1.64, "grad_norm": 0.339631050825119, "learning_rate": 4.3169583557009064e-05, "loss": 0.0732, "step": 926 }, { "epoch": 1.64, "grad_norm": 0.2175193578004837, "learning_rate": 4.307724709488409e-05, "loss": 0.0464, "step": 927 }, { "epoch": 1.64, "grad_norm": 0.23093104362487793, "learning_rate": 4.298493469735869e-05, "loss": 0.0335, "step": 928 }, { "epoch": 1.64, "grad_norm": 0.38875579833984375, "learning_rate": 4.289264668532557e-05, "loss": 0.0327, "step": 929 }, { "epoch": 1.65, "grad_norm": 0.05294647812843323, "learning_rate": 4.280038337959268e-05, "loss": 0.007, "step": 930 }, { "epoch": 1.65, "grad_norm": 0.36184802651405334, "learning_rate": 4.270814510088203e-05, "loss": 0.0688, "step": 931 }, { "epoch": 1.65, "grad_norm": 0.531517744064331, "learning_rate": 4.2615932169828744e-05, "loss": 0.1305, "step": 932 }, { "epoch": 1.65, "grad_norm": 0.3471108376979828, "learning_rate": 4.2523744906979686e-05, "loss": 0.0236, "step": 933 }, { "epoch": 1.65, "grad_norm": 0.2624709904193878, "learning_rate": 4.24315836327926e-05, "loss": 0.0272, "step": 934 }, { "epoch": 1.65, "grad_norm": 0.3648707866668701, "learning_rate": 4.233944866763489e-05, "loss": 0.0384, "step": 935 }, { "epoch": 1.66, "grad_norm": 0.2199166864156723, "learning_rate": 4.224734033178241e-05, "loss": 0.0347, "step": 936 }, { "epoch": 1.66, "grad_norm": 0.44493308663368225, "learning_rate": 4.2155258945418566e-05, "loss": 0.0405, "step": 937 }, { "epoch": 1.66, "grad_norm": 0.4102453291416168, "learning_rate": 4.206320482863301e-05, "loss": 0.0849, "step": 938 }, { "epoch": 1.66, "grad_norm": 0.33510318398475647, "learning_rate": 4.1971178301420613e-05, "loss": 0.052, "step": 939 }, { "epoch": 1.66, "grad_norm": 0.40965744853019714, "learning_rate": 4.187917968368036e-05, "loss": 0.0848, "step": 940 }, { "epoch": 1.66, "grad_norm": 0.2755095064640045, "learning_rate": 4.178720929521418e-05, "loss": 0.0391, "step": 941 }, { "epoch": 1.67, "grad_norm": 0.32818931341171265, "learning_rate": 4.16952674557259e-05, "loss": 0.0327, "step": 942 }, { "epoch": 1.67, "grad_norm": 0.39538365602493286, "learning_rate": 4.1603354484820134e-05, "loss": 0.043, "step": 943 }, { "epoch": 1.67, "grad_norm": 0.2221785932779312, "learning_rate": 4.1511470702001074e-05, "loss": 0.0288, "step": 944 }, { "epoch": 1.67, "grad_norm": 0.2112448811531067, "learning_rate": 4.141961642667152e-05, "loss": 0.025, "step": 945 }, { "epoch": 1.67, "grad_norm": 0.15548011660575867, "learning_rate": 4.132779197813164e-05, "loss": 0.0694, "step": 946 }, { "epoch": 1.68, "grad_norm": 0.10790842771530151, "learning_rate": 4.1235997675577956e-05, "loss": 0.0124, "step": 947 }, { "epoch": 1.68, "grad_norm": 0.42269936203956604, "learning_rate": 4.11442338381022e-05, "loss": 0.0937, "step": 948 }, { "epoch": 1.68, "grad_norm": 0.36002832651138306, "learning_rate": 4.105250078469018e-05, "loss": 0.1242, "step": 949 }, { "epoch": 1.68, "grad_norm": 0.22437407076358795, "learning_rate": 4.0960798834220704e-05, "loss": 0.0638, "step": 950 }, { "epoch": 1.68, "grad_norm": 0.3967968225479126, "learning_rate": 4.086912830546448e-05, "loss": 0.0439, "step": 951 }, { "epoch": 1.68, "grad_norm": 0.20550177991390228, "learning_rate": 4.077748951708292e-05, "loss": 0.0347, "step": 952 }, { "epoch": 1.69, "grad_norm": 0.2020653337240219, "learning_rate": 4.068588278762723e-05, "loss": 0.0376, "step": 953 }, { "epoch": 1.69, "grad_norm": 0.19614431262016296, "learning_rate": 4.0594308435537024e-05, "loss": 0.032, "step": 954 }, { "epoch": 1.69, "grad_norm": 0.08721073716878891, "learning_rate": 4.0502766779139484e-05, "loss": 0.012, "step": 955 }, { "epoch": 1.69, "grad_norm": 0.2303171306848526, "learning_rate": 4.041125813664808e-05, "loss": 0.0268, "step": 956 }, { "epoch": 1.69, "grad_norm": 0.2725672721862793, "learning_rate": 4.031978282616151e-05, "loss": 0.0413, "step": 957 }, { "epoch": 1.69, "grad_norm": 0.28184202313423157, "learning_rate": 4.0228341165662685e-05, "loss": 0.0383, "step": 958 }, { "epoch": 1.7, "grad_norm": 0.28514358401298523, "learning_rate": 4.0136933473017407e-05, "loss": 0.044, "step": 959 }, { "epoch": 1.7, "grad_norm": 0.374714732170105, "learning_rate": 4.004556006597353e-05, "loss": 0.044, "step": 960 }, { "epoch": 1.7, "grad_norm": 0.19936969876289368, "learning_rate": 3.9954221262159674e-05, "loss": 0.0334, "step": 961 }, { "epoch": 1.7, "grad_norm": 0.3212338984012604, "learning_rate": 3.986291737908414e-05, "loss": 0.0473, "step": 962 }, { "epoch": 1.7, "grad_norm": 0.28045183420181274, "learning_rate": 3.9771648734133906e-05, "loss": 0.0321, "step": 963 }, { "epoch": 1.71, "grad_norm": 0.34167036414146423, "learning_rate": 3.968041564457342e-05, "loss": 0.0696, "step": 964 }, { "epoch": 1.71, "grad_norm": 0.5529135465621948, "learning_rate": 3.958921842754351e-05, "loss": 0.131, "step": 965 }, { "epoch": 1.71, "grad_norm": 0.275803804397583, "learning_rate": 3.949805740006036e-05, "loss": 0.0436, "step": 966 }, { "epoch": 1.71, "grad_norm": 0.3003288209438324, "learning_rate": 3.94069328790143e-05, "loss": 0.073, "step": 967 }, { "epoch": 1.71, "grad_norm": 0.20078504085540771, "learning_rate": 3.9315845181168784e-05, "loss": 0.0425, "step": 968 }, { "epoch": 1.71, "grad_norm": 0.3904169797897339, "learning_rate": 3.9224794623159294e-05, "loss": 0.0668, "step": 969 }, { "epoch": 1.72, "grad_norm": 0.212997168302536, "learning_rate": 3.913378152149214e-05, "loss": 0.0436, "step": 970 }, { "epoch": 1.72, "grad_norm": 0.04039880260825157, "learning_rate": 3.904280619254348e-05, "loss": 0.0077, "step": 971 }, { "epoch": 1.72, "grad_norm": 0.21076536178588867, "learning_rate": 3.895186895255814e-05, "loss": 0.0677, "step": 972 }, { "epoch": 1.72, "grad_norm": 0.37436169385910034, "learning_rate": 3.886097011764856e-05, "loss": 0.0294, "step": 973 }, { "epoch": 1.72, "grad_norm": 0.26611942052841187, "learning_rate": 3.877011000379367e-05, "loss": 0.057, "step": 974 }, { "epoch": 1.72, "grad_norm": 0.32198566198349, "learning_rate": 3.8679288926837804e-05, "loss": 0.0583, "step": 975 }, { "epoch": 1.73, "grad_norm": 0.2785477340221405, "learning_rate": 3.8588507202489586e-05, "loss": 0.0913, "step": 976 }, { "epoch": 1.73, "grad_norm": 0.20920749008655548, "learning_rate": 3.8497765146320876e-05, "loss": 0.0454, "step": 977 }, { "epoch": 1.73, "grad_norm": 0.31738653779029846, "learning_rate": 3.840706307376557e-05, "loss": 0.0464, "step": 978 }, { "epoch": 1.73, "grad_norm": 0.1887190192937851, "learning_rate": 3.8316401300118675e-05, "loss": 0.026, "step": 979 }, { "epoch": 1.73, "grad_norm": 0.22016988694667816, "learning_rate": 3.8225780140535025e-05, "loss": 0.0375, "step": 980 }, { "epoch": 1.74, "grad_norm": 0.2261650264263153, "learning_rate": 3.813519991002831e-05, "loss": 0.0368, "step": 981 }, { "epoch": 1.74, "grad_norm": 0.3108493983745575, "learning_rate": 3.804466092346997e-05, "loss": 0.0539, "step": 982 }, { "epoch": 1.74, "grad_norm": 0.23392857611179352, "learning_rate": 3.7954163495587995e-05, "loss": 0.0363, "step": 983 }, { "epoch": 1.74, "grad_norm": 0.23699642717838287, "learning_rate": 3.786370794096603e-05, "loss": 0.0362, "step": 984 }, { "epoch": 1.74, "grad_norm": 0.29672032594680786, "learning_rate": 3.777329457404202e-05, "loss": 0.0388, "step": 985 }, { "epoch": 1.74, "grad_norm": 0.25258907675743103, "learning_rate": 3.768292370910737e-05, "loss": 0.0278, "step": 986 }, { "epoch": 1.75, "grad_norm": 0.40179169178009033, "learning_rate": 3.759259566030571e-05, "loss": 0.1118, "step": 987 }, { "epoch": 1.75, "grad_norm": 0.3777885437011719, "learning_rate": 3.750231074163179e-05, "loss": 0.073, "step": 988 }, { "epoch": 1.75, "grad_norm": 0.39896661043167114, "learning_rate": 3.7412069266930516e-05, "loss": 0.0428, "step": 989 }, { "epoch": 1.75, "grad_norm": 0.26577284932136536, "learning_rate": 3.7321871549895714e-05, "loss": 0.0335, "step": 990 }, { "epoch": 1.75, "grad_norm": 0.5360684990882874, "learning_rate": 3.7231717904069094e-05, "loss": 0.0979, "step": 991 }, { "epoch": 1.75, "grad_norm": 0.21000511944293976, "learning_rate": 3.714160864283923e-05, "loss": 0.0156, "step": 992 }, { "epoch": 1.76, "grad_norm": 0.11425631493330002, "learning_rate": 3.7051544079440336e-05, "loss": 0.0143, "step": 993 }, { "epoch": 1.76, "grad_norm": 0.20283763110637665, "learning_rate": 3.696152452695128e-05, "loss": 0.0627, "step": 994 }, { "epoch": 1.76, "eval_loss": 0.07780980318784714, "eval_runtime": 14.6775, "eval_samples_per_second": 32.499, "eval_steps_per_second": 8.176, "step": 994 }, { "epoch": 1.76, "grad_norm": 0.13721764087677002, "learning_rate": 3.68715502982945e-05, "loss": 0.0154, "step": 995 }, { "epoch": 1.76, "grad_norm": 0.397158145904541, "learning_rate": 3.678162170623481e-05, "loss": 0.0494, "step": 996 }, { "epoch": 1.76, "grad_norm": 0.47730910778045654, "learning_rate": 3.669173906337846e-05, "loss": 0.0897, "step": 997 }, { "epoch": 1.77, "grad_norm": 0.19200514256954193, "learning_rate": 3.6601902682171894e-05, "loss": 0.0145, "step": 998 }, { "epoch": 1.77, "grad_norm": 0.39415818452835083, "learning_rate": 3.65121128749008e-05, "loss": 0.0778, "step": 999 }, { "epoch": 1.77, "grad_norm": 0.17673304677009583, "learning_rate": 3.642236995368897e-05, "loss": 0.0211, "step": 1000 }, { "epoch": 1.77, "grad_norm": 0.3190731108188629, "learning_rate": 3.633267423049717e-05, "loss": 0.0856, "step": 1001 }, { "epoch": 1.77, "grad_norm": 0.3833164572715759, "learning_rate": 3.624302601712213e-05, "loss": 0.0687, "step": 1002 }, { "epoch": 1.77, "grad_norm": 0.2938999831676483, "learning_rate": 3.6153425625195425e-05, "loss": 0.0717, "step": 1003 }, { "epoch": 1.78, "grad_norm": 0.2645472586154938, "learning_rate": 3.606387336618237e-05, "loss": 0.0341, "step": 1004 }, { "epoch": 1.78, "grad_norm": 0.1330021619796753, "learning_rate": 3.597436955138102e-05, "loss": 0.0244, "step": 1005 }, { "epoch": 1.78, "grad_norm": 0.34308159351348877, "learning_rate": 3.588491449192096e-05, "loss": 0.091, "step": 1006 }, { "epoch": 1.78, "grad_norm": 0.2861696481704712, "learning_rate": 3.579550849876233e-05, "loss": 0.0601, "step": 1007 }, { "epoch": 1.78, "grad_norm": 0.32016104459762573, "learning_rate": 3.570615188269473e-05, "loss": 0.0699, "step": 1008 }, { "epoch": 1.79, "grad_norm": 0.2865599989891052, "learning_rate": 3.561684495433605e-05, "loss": 0.0742, "step": 1009 }, { "epoch": 1.79, "grad_norm": 0.2045123279094696, "learning_rate": 3.5527588024131544e-05, "loss": 0.0323, "step": 1010 }, { "epoch": 1.79, "grad_norm": 0.13562075793743134, "learning_rate": 3.5438381402352574e-05, "loss": 0.0167, "step": 1011 }, { "epoch": 1.79, "grad_norm": 0.2536921799182892, "learning_rate": 3.534922539909569e-05, "loss": 0.047, "step": 1012 }, { "epoch": 1.79, "grad_norm": 0.193417489528656, "learning_rate": 3.5260120324281474e-05, "loss": 0.0349, "step": 1013 }, { "epoch": 1.79, "grad_norm": 0.18863309919834137, "learning_rate": 3.517106648765343e-05, "loss": 0.0261, "step": 1014 }, { "epoch": 1.8, "grad_norm": 0.5035936832427979, "learning_rate": 3.5082064198777e-05, "loss": 0.0963, "step": 1015 }, { "epoch": 1.8, "grad_norm": 0.33512285351753235, "learning_rate": 3.499311376703842e-05, "loss": 0.0534, "step": 1016 }, { "epoch": 1.8, "grad_norm": 0.14217574894428253, "learning_rate": 3.4904215501643646e-05, "loss": 0.0246, "step": 1017 }, { "epoch": 1.8, "grad_norm": 0.3124421536922455, "learning_rate": 3.4815369711617316e-05, "loss": 0.0498, "step": 1018 }, { "epoch": 1.8, "grad_norm": 0.22936655580997467, "learning_rate": 3.4726576705801636e-05, "loss": 0.0249, "step": 1019 }, { "epoch": 1.8, "grad_norm": 0.5534436106681824, "learning_rate": 3.463783679285535e-05, "loss": 0.1696, "step": 1020 }, { "epoch": 1.81, "grad_norm": 0.3127197027206421, "learning_rate": 3.4549150281252636e-05, "loss": 0.045, "step": 1021 }, { "epoch": 1.81, "grad_norm": 0.13819609582424164, "learning_rate": 3.446051747928202e-05, "loss": 0.0203, "step": 1022 }, { "epoch": 1.81, "grad_norm": 0.35015806555747986, "learning_rate": 3.4371938695045346e-05, "loss": 0.0608, "step": 1023 }, { "epoch": 1.81, "grad_norm": 0.3767643868923187, "learning_rate": 3.428341423645668e-05, "loss": 0.0686, "step": 1024 }, { "epoch": 1.81, "grad_norm": 0.3282710611820221, "learning_rate": 3.419494441124121e-05, "loss": 0.0697, "step": 1025 }, { "epoch": 1.82, "grad_norm": 0.24474768340587616, "learning_rate": 3.4106529526934306e-05, "loss": 0.0583, "step": 1026 }, { "epoch": 1.82, "grad_norm": 0.24781620502471924, "learning_rate": 3.4018169890880225e-05, "loss": 0.0327, "step": 1027 }, { "epoch": 1.82, "grad_norm": 0.2474932074546814, "learning_rate": 3.392986581023126e-05, "loss": 0.0679, "step": 1028 }, { "epoch": 1.82, "grad_norm": 0.39474180340766907, "learning_rate": 3.384161759194658e-05, "loss": 0.0713, "step": 1029 }, { "epoch": 1.82, "grad_norm": 0.13963012397289276, "learning_rate": 3.375342554279111e-05, "loss": 0.0179, "step": 1030 }, { "epoch": 1.82, "grad_norm": 0.32144859433174133, "learning_rate": 3.3665289969334585e-05, "loss": 0.0447, "step": 1031 }, { "epoch": 1.83, "grad_norm": 0.14767055213451385, "learning_rate": 3.3577211177950385e-05, "loss": 0.017, "step": 1032 }, { "epoch": 1.83, "grad_norm": 0.3088414967060089, "learning_rate": 3.348918947481452e-05, "loss": 0.0483, "step": 1033 }, { "epoch": 1.83, "grad_norm": 0.33189231157302856, "learning_rate": 3.340122516590456e-05, "loss": 0.0382, "step": 1034 }, { "epoch": 1.83, "grad_norm": 0.4593893885612488, "learning_rate": 3.3313318556998526e-05, "loss": 0.0523, "step": 1035 }, { "epoch": 1.83, "grad_norm": 0.253412127494812, "learning_rate": 3.322546995367394e-05, "loss": 0.0212, "step": 1036 }, { "epoch": 1.83, "grad_norm": 0.42397648096084595, "learning_rate": 3.3137679661306576e-05, "loss": 0.087, "step": 1037 }, { "epoch": 1.84, "grad_norm": 0.40718910098075867, "learning_rate": 3.3049947985069616e-05, "loss": 0.0965, "step": 1038 }, { "epoch": 1.84, "grad_norm": 0.3604757487773895, "learning_rate": 3.2962275229932446e-05, "loss": 0.0973, "step": 1039 }, { "epoch": 1.84, "grad_norm": 0.2129022628068924, "learning_rate": 3.287466170065959e-05, "loss": 0.0384, "step": 1040 }, { "epoch": 1.84, "grad_norm": 0.20552971959114075, "learning_rate": 3.2787107701809754e-05, "loss": 0.0305, "step": 1041 }, { "epoch": 1.84, "grad_norm": 0.41909754276275635, "learning_rate": 3.269961353773469e-05, "loss": 0.0725, "step": 1042 }, { "epoch": 1.85, "grad_norm": 0.2766873240470886, "learning_rate": 3.261217951257813e-05, "loss": 0.0649, "step": 1043 }, { "epoch": 1.85, "grad_norm": 0.1571783572435379, "learning_rate": 3.252480593027478e-05, "loss": 0.0238, "step": 1044 }, { "epoch": 1.85, "grad_norm": 0.2809221148490906, "learning_rate": 3.243749309454922e-05, "loss": 0.0613, "step": 1045 }, { "epoch": 1.85, "grad_norm": 0.4999224841594696, "learning_rate": 3.235024130891487e-05, "loss": 0.0806, "step": 1046 }, { "epoch": 1.85, "grad_norm": 0.1895889937877655, "learning_rate": 3.226305087667295e-05, "loss": 0.026, "step": 1047 }, { "epoch": 1.85, "grad_norm": 0.220509871840477, "learning_rate": 3.217592210091137e-05, "loss": 0.0681, "step": 1048 }, { "epoch": 1.86, "grad_norm": 0.19226831197738647, "learning_rate": 3.208885528450376e-05, "loss": 0.0232, "step": 1049 }, { "epoch": 1.86, "grad_norm": 0.30779534578323364, "learning_rate": 3.200185073010831e-05, "loss": 0.0547, "step": 1050 }, { "epoch": 1.86, "grad_norm": 0.16252338886260986, "learning_rate": 3.1914908740166795e-05, "loss": 0.0237, "step": 1051 }, { "epoch": 1.86, "grad_norm": 0.4130539000034332, "learning_rate": 3.182802961690357e-05, "loss": 0.0437, "step": 1052 }, { "epoch": 1.86, "grad_norm": 0.20889542996883392, "learning_rate": 3.1741213662324365e-05, "loss": 0.0493, "step": 1053 }, { "epoch": 1.86, "grad_norm": 0.27447709441185, "learning_rate": 3.165446117821538e-05, "loss": 0.0859, "step": 1054 }, { "epoch": 1.87, "grad_norm": 0.07740370184183121, "learning_rate": 3.1567772466142156e-05, "loss": 0.011, "step": 1055 }, { "epoch": 1.87, "grad_norm": 0.1462429016828537, "learning_rate": 3.148114782744855e-05, "loss": 0.0228, "step": 1056 }, { "epoch": 1.87, "grad_norm": 0.34325939416885376, "learning_rate": 3.139458756325576e-05, "loss": 0.0928, "step": 1057 }, { "epoch": 1.87, "grad_norm": 0.2712673246860504, "learning_rate": 3.130809197446106e-05, "loss": 0.0408, "step": 1058 }, { "epoch": 1.87, "grad_norm": 0.24180567264556885, "learning_rate": 3.122166136173706e-05, "loss": 0.0309, "step": 1059 }, { "epoch": 1.88, "grad_norm": 0.2045336663722992, "learning_rate": 3.113529602553042e-05, "loss": 0.0391, "step": 1060 }, { "epoch": 1.88, "grad_norm": 0.09178590029478073, "learning_rate": 3.104899626606088e-05, "loss": 0.0132, "step": 1061 }, { "epoch": 1.88, "grad_norm": 0.300592303276062, "learning_rate": 3.0962762383320285e-05, "loss": 0.0787, "step": 1062 }, { "epoch": 1.88, "grad_norm": 0.31034108996391296, "learning_rate": 3.08765946770714e-05, "loss": 0.0307, "step": 1063 }, { "epoch": 1.88, "grad_norm": 0.1603342443704605, "learning_rate": 3.0790493446847024e-05, "loss": 0.0209, "step": 1064 }, { "epoch": 1.88, "grad_norm": 0.23990066349506378, "learning_rate": 3.070445899194885e-05, "loss": 0.0361, "step": 1065 }, { "epoch": 1.89, "grad_norm": 0.15933218598365784, "learning_rate": 3.061849161144641e-05, "loss": 0.0137, "step": 1066 }, { "epoch": 1.89, "grad_norm": 0.4810096025466919, "learning_rate": 3.053259160417613e-05, "loss": 0.1255, "step": 1067 }, { "epoch": 1.89, "grad_norm": 0.4099353849887848, "learning_rate": 3.0446759268740233e-05, "loss": 0.0718, "step": 1068 }, { "epoch": 1.89, "grad_norm": 0.39221569895744324, "learning_rate": 3.0360994903505653e-05, "loss": 0.0721, "step": 1069 }, { "epoch": 1.89, "grad_norm": 0.48040878772735596, "learning_rate": 3.02752988066031e-05, "loss": 0.1077, "step": 1070 }, { "epoch": 1.89, "grad_norm": 0.1548435539007187, "learning_rate": 3.018967127592595e-05, "loss": 0.0273, "step": 1071 }, { "epoch": 1.9, "grad_norm": 0.37816908955574036, "learning_rate": 3.010411260912922e-05, "loss": 0.0421, "step": 1072 }, { "epoch": 1.9, "grad_norm": 0.2902359366416931, "learning_rate": 3.0018623103628596e-05, "loss": 0.0645, "step": 1073 }, { "epoch": 1.9, "grad_norm": 0.32010090351104736, "learning_rate": 2.9933203056599275e-05, "loss": 0.0444, "step": 1074 }, { "epoch": 1.9, "grad_norm": 0.47379517555236816, "learning_rate": 2.984785276497507e-05, "loss": 0.0423, "step": 1075 }, { "epoch": 1.9, "grad_norm": 0.3198046088218689, "learning_rate": 2.9762572525447262e-05, "loss": 0.0678, "step": 1076 }, { "epoch": 1.91, "grad_norm": 0.33741793036460876, "learning_rate": 2.9677362634463647e-05, "loss": 0.0643, "step": 1077 }, { "epoch": 1.91, "grad_norm": 0.2264060080051422, "learning_rate": 2.9592223388227503e-05, "loss": 0.0275, "step": 1078 }, { "epoch": 1.91, "grad_norm": 0.3069595694541931, "learning_rate": 2.9507155082696482e-05, "loss": 0.0481, "step": 1079 }, { "epoch": 1.91, "grad_norm": 0.26178085803985596, "learning_rate": 2.9422158013581658e-05, "loss": 0.0298, "step": 1080 }, { "epoch": 1.91, "grad_norm": 0.4191998243331909, "learning_rate": 2.93372324763465e-05, "loss": 0.1156, "step": 1081 }, { "epoch": 1.91, "grad_norm": 0.21308496594429016, "learning_rate": 2.9252378766205758e-05, "loss": 0.0478, "step": 1082 }, { "epoch": 1.92, "grad_norm": 0.20033082365989685, "learning_rate": 2.9167597178124585e-05, "loss": 0.0262, "step": 1083 }, { "epoch": 1.92, "grad_norm": 0.21688255667686462, "learning_rate": 2.9082888006817365e-05, "loss": 0.0767, "step": 1084 }, { "epoch": 1.92, "grad_norm": 0.300791472196579, "learning_rate": 2.899825154674674e-05, "loss": 0.0464, "step": 1085 }, { "epoch": 1.92, "grad_norm": 0.2416476011276245, "learning_rate": 2.8913688092122664e-05, "loss": 0.0605, "step": 1086 }, { "epoch": 1.92, "grad_norm": 0.2521096169948578, "learning_rate": 2.8829197936901232e-05, "loss": 0.0293, "step": 1087 }, { "epoch": 1.92, "grad_norm": 0.3229115605354309, "learning_rate": 2.8744781374783813e-05, "loss": 0.0435, "step": 1088 }, { "epoch": 1.93, "grad_norm": 0.0780615508556366, "learning_rate": 2.8660438699215898e-05, "loss": 0.0109, "step": 1089 }, { "epoch": 1.93, "grad_norm": 0.2879962623119354, "learning_rate": 2.8576170203386143e-05, "loss": 0.0665, "step": 1090 }, { "epoch": 1.93, "grad_norm": 0.19985683262348175, "learning_rate": 2.8491976180225388e-05, "loss": 0.0378, "step": 1091 }, { "epoch": 1.93, "grad_norm": 0.15360887348651886, "learning_rate": 2.840785692240553e-05, "loss": 0.0257, "step": 1092 }, { "epoch": 1.93, "grad_norm": 0.19224387407302856, "learning_rate": 2.832381272233864e-05, "loss": 0.0529, "step": 1093 }, { "epoch": 1.94, "grad_norm": 0.2037738561630249, "learning_rate": 2.8239843872175814e-05, "loss": 0.0228, "step": 1094 }, { "epoch": 1.94, "grad_norm": 0.380874902009964, "learning_rate": 2.8155950663806235e-05, "loss": 0.0525, "step": 1095 }, { "epoch": 1.94, "grad_norm": 0.28522253036499023, "learning_rate": 2.8072133388856192e-05, "loss": 0.0615, "step": 1096 }, { "epoch": 1.94, "grad_norm": 0.1098146066069603, "learning_rate": 2.7988392338687926e-05, "loss": 0.0159, "step": 1097 }, { "epoch": 1.94, "grad_norm": 0.2615334987640381, "learning_rate": 2.7904727804398812e-05, "loss": 0.0353, "step": 1098 }, { "epoch": 1.94, "grad_norm": 0.2041955441236496, "learning_rate": 2.7821140076820162e-05, "loss": 0.0187, "step": 1099 }, { "epoch": 1.95, "grad_norm": 0.1846192330121994, "learning_rate": 2.773762944651632e-05, "loss": 0.0554, "step": 1100 }, { "epoch": 1.95, "grad_norm": 0.17711102962493896, "learning_rate": 2.765419620378366e-05, "loss": 0.0342, "step": 1101 }, { "epoch": 1.95, "grad_norm": 0.3703756332397461, "learning_rate": 2.7570840638649486e-05, "loss": 0.0378, "step": 1102 }, { "epoch": 1.95, "grad_norm": 0.4282096326351166, "learning_rate": 2.7487563040871145e-05, "loss": 0.0789, "step": 1103 }, { "epoch": 1.95, "grad_norm": 0.32506605982780457, "learning_rate": 2.740436369993491e-05, "loss": 0.0337, "step": 1104 }, { "epoch": 1.95, "grad_norm": 0.250688374042511, "learning_rate": 2.7321242905055013e-05, "loss": 0.0554, "step": 1105 }, { "epoch": 1.96, "grad_norm": 0.3557257354259491, "learning_rate": 2.7238200945172698e-05, "loss": 0.0356, "step": 1106 }, { "epoch": 1.96, "grad_norm": 0.3472774028778076, "learning_rate": 2.715523810895515e-05, "loss": 0.0348, "step": 1107 }, { "epoch": 1.96, "grad_norm": 0.07373315095901489, "learning_rate": 2.707235468479449e-05, "loss": 0.0088, "step": 1108 }, { "epoch": 1.96, "grad_norm": 0.21439437568187714, "learning_rate": 2.6989550960806768e-05, "loss": 0.0222, "step": 1109 }, { "epoch": 1.96, "grad_norm": 0.2730681002140045, "learning_rate": 2.690682722483102e-05, "loss": 0.068, "step": 1110 }, { "epoch": 1.97, "grad_norm": 0.41124334931373596, "learning_rate": 2.6824183764428224e-05, "loss": 0.086, "step": 1111 }, { "epoch": 1.97, "grad_norm": 0.6637737154960632, "learning_rate": 2.6741620866880335e-05, "loss": 0.0365, "step": 1112 }, { "epoch": 1.97, "grad_norm": 0.425441712141037, "learning_rate": 2.665913881918921e-05, "loss": 0.095, "step": 1113 }, { "epoch": 1.97, "grad_norm": 0.5520187020301819, "learning_rate": 2.6576737908075668e-05, "loss": 0.0514, "step": 1114 }, { "epoch": 1.97, "grad_norm": 0.5284621119499207, "learning_rate": 2.6494418419978482e-05, "loss": 0.0593, "step": 1115 }, { "epoch": 1.97, "grad_norm": 0.08148845285177231, "learning_rate": 2.641218064105341e-05, "loss": 0.0084, "step": 1116 }, { "epoch": 1.98, "grad_norm": 0.8642109036445618, "learning_rate": 2.6330024857172192e-05, "loss": 0.0766, "step": 1117 }, { "epoch": 1.98, "grad_norm": 0.40509146451950073, "learning_rate": 2.6247951353921485e-05, "loss": 0.1148, "step": 1118 }, { "epoch": 1.98, "grad_norm": 0.2981242537498474, "learning_rate": 2.616596041660194e-05, "loss": 0.0666, "step": 1119 }, { "epoch": 1.98, "grad_norm": 0.21514151990413666, "learning_rate": 2.6084052330227238e-05, "loss": 0.0363, "step": 1120 }, { "epoch": 1.98, "grad_norm": 0.10281267762184143, "learning_rate": 2.6002227379522992e-05, "loss": 0.0169, "step": 1121 }, { "epoch": 1.98, "grad_norm": 0.3236760199069977, "learning_rate": 2.5920485848925913e-05, "loss": 0.0296, "step": 1122 }, { "epoch": 1.99, "grad_norm": 0.22741632163524628, "learning_rate": 2.5838828022582594e-05, "loss": 0.023, "step": 1123 }, { "epoch": 1.99, "grad_norm": 0.3826078772544861, "learning_rate": 2.5757254184348778e-05, "loss": 0.0744, "step": 1124 }, { "epoch": 1.99, "grad_norm": 0.226307213306427, "learning_rate": 2.5675764617788234e-05, "loss": 0.0297, "step": 1125 }, { "epoch": 1.99, "grad_norm": 0.31913021206855774, "learning_rate": 2.5594359606171724e-05, "loss": 0.0793, "step": 1126 }, { "epoch": 1.99, "grad_norm": 0.2947479486465454, "learning_rate": 2.5513039432476193e-05, "loss": 0.1363, "step": 1127 }, { "epoch": 2.0, "grad_norm": 0.26046791672706604, "learning_rate": 2.5431804379383523e-05, "loss": 0.0727, "step": 1128 }, { "epoch": 2.0, "grad_norm": 0.1183793917298317, "learning_rate": 2.535065472927983e-05, "loss": 0.0139, "step": 1129 }, { "epoch": 2.0, "grad_norm": 0.23370495438575745, "learning_rate": 2.526959076425434e-05, "loss": 0.0503, "step": 1130 } ], "logging_steps": 1, "max_steps": 1695, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 565, "total_flos": 1.0339891388035891e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }