diff --git "a/checkpoint-1130/trainer_state.json" "b/checkpoint-1130/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1130/trainer_state.json" @@ -0,0 +1,7995 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9991154356479433, + "eval_steps": 142, + "global_step": 1130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.680586814880371, + "learning_rate": 1e-05, + "loss": 3.3182, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 3.3362529277801514, + "eval_runtime": 14.4477, + "eval_samples_per_second": 33.016, + "eval_steps_per_second": 8.306, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.609802722930908, + "learning_rate": 2e-05, + "loss": 3.2788, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 4.793943405151367, + "learning_rate": 3e-05, + "loss": 3.3432, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 4.687256336212158, + "learning_rate": 4e-05, + "loss": 3.2521, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 4.676945209503174, + "learning_rate": 5e-05, + "loss": 3.1085, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 4.490086078643799, + "learning_rate": 6e-05, + "loss": 2.8093, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 4.042544364929199, + "learning_rate": 7e-05, + "loss": 2.3501, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 3.4973549842834473, + "learning_rate": 8e-05, + "loss": 1.6118, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 2.9255049228668213, + "learning_rate": 9e-05, + "loss": 0.9938, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 1.9458708763122559, + "learning_rate": 0.0001, + "loss": 0.4821, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 1.3317277431488037, + "learning_rate": 9.999991309598974e-05, + "loss": 0.3336, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.7918155193328857, + "learning_rate": 9.999965238426104e-05, + "loss": 0.1707, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.779201090335846, + "learning_rate": 9.999921786572015e-05, + "loss": 0.1089, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 1.8333582878112793, + "learning_rate": 9.999860954187756e-05, + "loss": 0.1829, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 0.7508969902992249, + "learning_rate": 9.999782741484788e-05, + "loss": 0.1284, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 0.6024438142776489, + "learning_rate": 9.999687148734995e-05, + "loss": 0.1321, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.2141278237104416, + "learning_rate": 9.999574176270667e-05, + "loss": 0.1294, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 0.8326414227485657, + "learning_rate": 9.999443824484519e-05, + "loss": 0.1414, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.25254812836647034, + "learning_rate": 9.999296093829672e-05, + "loss": 0.1389, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 0.24321849644184113, + "learning_rate": 9.999130984819662e-05, + "loss": 0.1354, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 0.27592459321022034, + "learning_rate": 9.998948498028435e-05, + "loss": 0.1032, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.5619893670082092, + "learning_rate": 9.998748634090344e-05, + "loss": 0.1264, + "step": 22 + }, + { + "epoch": 0.04, + "grad_norm": 0.7668437361717224, + "learning_rate": 9.998531393700148e-05, + "loss": 0.1223, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.38381776213645935, + "learning_rate": 9.99829677761301e-05, + "loss": 0.0988, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 1.8528074026107788, + "learning_rate": 9.998044786644491e-05, + "loss": 0.1421, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 2.61940336227417, + "learning_rate": 9.997775421670556e-05, + "loss": 0.2738, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 0.5642948150634766, + "learning_rate": 9.997488683627559e-05, + "loss": 0.1113, + "step": 27 + }, + { + "epoch": 0.05, + "grad_norm": 0.31713128089904785, + "learning_rate": 9.997184573512245e-05, + "loss": 0.0593, + "step": 28 + }, + { + "epoch": 0.05, + "grad_norm": 1.540770411491394, + "learning_rate": 9.996863092381752e-05, + "loss": 0.2008, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.5343081951141357, + "learning_rate": 9.9965242413536e-05, + "loss": 0.1141, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 0.3453178107738495, + "learning_rate": 9.99616802160569e-05, + "loss": 0.1137, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 0.24307988584041595, + "learning_rate": 9.995794434376297e-05, + "loss": 0.0971, + "step": 32 + }, + { + "epoch": 0.06, + "grad_norm": 0.1228247806429863, + "learning_rate": 9.995403480964072e-05, + "loss": 0.1246, + "step": 33 + }, + { + "epoch": 0.06, + "grad_norm": 0.28919702768325806, + "learning_rate": 9.994995162728029e-05, + "loss": 0.1485, + "step": 34 + }, + { + "epoch": 0.06, + "grad_norm": 0.16266010701656342, + "learning_rate": 9.994569481087552e-05, + "loss": 0.1196, + "step": 35 + }, + { + "epoch": 0.06, + "grad_norm": 0.13942277431488037, + "learning_rate": 9.994126437522375e-05, + "loss": 0.1266, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 0.22137008607387543, + "learning_rate": 9.99366603357259e-05, + "loss": 0.1226, + "step": 37 + }, + { + "epoch": 0.07, + "grad_norm": 0.4418635070323944, + "learning_rate": 9.993188270838635e-05, + "loss": 0.1577, + "step": 38 + }, + { + "epoch": 0.07, + "grad_norm": 0.1568412482738495, + "learning_rate": 9.992693150981292e-05, + "loss": 0.1205, + "step": 39 + }, + { + "epoch": 0.07, + "grad_norm": 0.33117881417274475, + "learning_rate": 9.992180675721672e-05, + "loss": 0.1179, + "step": 40 + }, + { + "epoch": 0.07, + "grad_norm": 0.18135391175746918, + "learning_rate": 9.991650846841226e-05, + "loss": 0.098, + "step": 41 + }, + { + "epoch": 0.07, + "grad_norm": 0.23688584566116333, + "learning_rate": 9.99110366618172e-05, + "loss": 0.0839, + "step": 42 + }, + { + "epoch": 0.08, + "grad_norm": 0.40056225657463074, + "learning_rate": 9.990539135645245e-05, + "loss": 0.1648, + "step": 43 + }, + { + "epoch": 0.08, + "grad_norm": 0.16901437938213348, + "learning_rate": 9.9899572571942e-05, + "loss": 0.052, + "step": 44 + }, + { + "epoch": 0.08, + "grad_norm": 0.6529514193534851, + "learning_rate": 9.989358032851284e-05, + "loss": 0.1448, + "step": 45 + }, + { + "epoch": 0.08, + "grad_norm": 0.25050362944602966, + "learning_rate": 9.9887414646995e-05, + "loss": 0.083, + "step": 46 + }, + { + "epoch": 0.08, + "grad_norm": 0.38230955600738525, + "learning_rate": 9.988107554882138e-05, + "loss": 0.0912, + "step": 47 + }, + { + "epoch": 0.08, + "grad_norm": 0.24738825857639313, + "learning_rate": 9.987456305602769e-05, + "loss": 0.1337, + "step": 48 + }, + { + "epoch": 0.09, + "grad_norm": 0.22692906856536865, + "learning_rate": 9.986787719125241e-05, + "loss": 0.0924, + "step": 49 + }, + { + "epoch": 0.09, + "grad_norm": 0.6348592638969421, + "learning_rate": 9.986101797773667e-05, + "loss": 0.1401, + "step": 50 + }, + { + "epoch": 0.09, + "grad_norm": 0.27844250202178955, + "learning_rate": 9.985398543932421e-05, + "loss": 0.1028, + "step": 51 + }, + { + "epoch": 0.09, + "grad_norm": 0.4353552460670471, + "learning_rate": 9.984677960046123e-05, + "loss": 0.1245, + "step": 52 + }, + { + "epoch": 0.09, + "grad_norm": 0.3049747943878174, + "learning_rate": 9.98394004861964e-05, + "loss": 0.0936, + "step": 53 + }, + { + "epoch": 0.1, + "grad_norm": 0.34531188011169434, + "learning_rate": 9.983184812218072e-05, + "loss": 0.0775, + "step": 54 + }, + { + "epoch": 0.1, + "grad_norm": 0.252056360244751, + "learning_rate": 9.98241225346674e-05, + "loss": 0.1082, + "step": 55 + }, + { + "epoch": 0.1, + "grad_norm": 0.14545601606369019, + "learning_rate": 9.981622375051183e-05, + "loss": 0.0857, + "step": 56 + }, + { + "epoch": 0.1, + "grad_norm": 0.313376784324646, + "learning_rate": 9.980815179717145e-05, + "loss": 0.0798, + "step": 57 + }, + { + "epoch": 0.1, + "grad_norm": 0.3164367079734802, + "learning_rate": 9.979990670270564e-05, + "loss": 0.1103, + "step": 58 + }, + { + "epoch": 0.1, + "grad_norm": 0.30412447452545166, + "learning_rate": 9.979148849577572e-05, + "loss": 0.0889, + "step": 59 + }, + { + "epoch": 0.11, + "grad_norm": 0.2582318186759949, + "learning_rate": 9.978289720564471e-05, + "loss": 0.0844, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 0.2912735939025879, + "learning_rate": 9.977413286217728e-05, + "loss": 0.077, + "step": 61 + }, + { + "epoch": 0.11, + "grad_norm": 0.3888057768344879, + "learning_rate": 9.976519549583974e-05, + "loss": 0.1386, + "step": 62 + }, + { + "epoch": 0.11, + "grad_norm": 0.42121654748916626, + "learning_rate": 9.975608513769976e-05, + "loss": 0.0823, + "step": 63 + }, + { + "epoch": 0.11, + "grad_norm": 0.4052259624004364, + "learning_rate": 9.974680181942645e-05, + "loss": 0.0846, + "step": 64 + }, + { + "epoch": 0.11, + "grad_norm": 0.2273682802915573, + "learning_rate": 9.973734557329009e-05, + "loss": 0.0589, + "step": 65 + }, + { + "epoch": 0.12, + "grad_norm": 0.5051669478416443, + "learning_rate": 9.972771643216212e-05, + "loss": 0.1111, + "step": 66 + }, + { + "epoch": 0.12, + "grad_norm": 0.2672370970249176, + "learning_rate": 9.971791442951497e-05, + "loss": 0.0819, + "step": 67 + }, + { + "epoch": 0.12, + "grad_norm": 0.28557881712913513, + "learning_rate": 9.970793959942198e-05, + "loss": 0.0912, + "step": 68 + }, + { + "epoch": 0.12, + "grad_norm": 0.34148702025413513, + "learning_rate": 9.969779197655726e-05, + "loss": 0.1036, + "step": 69 + }, + { + "epoch": 0.12, + "grad_norm": 0.3550778925418854, + "learning_rate": 9.968747159619556e-05, + "loss": 0.0833, + "step": 70 + }, + { + "epoch": 0.13, + "grad_norm": 0.3434258699417114, + "learning_rate": 9.967697849421221e-05, + "loss": 0.1186, + "step": 71 + }, + { + "epoch": 0.13, + "grad_norm": 0.23545867204666138, + "learning_rate": 9.966631270708287e-05, + "loss": 0.1185, + "step": 72 + }, + { + "epoch": 0.13, + "grad_norm": 0.28094542026519775, + "learning_rate": 9.965547427188357e-05, + "loss": 0.052, + "step": 73 + }, + { + "epoch": 0.13, + "grad_norm": 0.13517600297927856, + "learning_rate": 9.964446322629043e-05, + "loss": 0.0695, + "step": 74 + }, + { + "epoch": 0.13, + "grad_norm": 0.16696467995643616, + "learning_rate": 9.963327960857962e-05, + "loss": 0.1003, + "step": 75 + }, + { + "epoch": 0.13, + "grad_norm": 0.18569788336753845, + "learning_rate": 9.962192345762717e-05, + "loss": 0.0495, + "step": 76 + }, + { + "epoch": 0.14, + "grad_norm": 0.19817449152469635, + "learning_rate": 9.961039481290888e-05, + "loss": 0.067, + "step": 77 + }, + { + "epoch": 0.14, + "grad_norm": 0.14672966301441193, + "learning_rate": 9.959869371450021e-05, + "loss": 0.0737, + "step": 78 + }, + { + "epoch": 0.14, + "grad_norm": 0.302121639251709, + "learning_rate": 9.958682020307601e-05, + "loss": 0.0779, + "step": 79 + }, + { + "epoch": 0.14, + "grad_norm": 0.35970303416252136, + "learning_rate": 9.957477431991054e-05, + "loss": 0.134, + "step": 80 + }, + { + "epoch": 0.14, + "grad_norm": 0.308292955160141, + "learning_rate": 9.956255610687719e-05, + "loss": 0.1006, + "step": 81 + }, + { + "epoch": 0.15, + "grad_norm": 0.27124735713005066, + "learning_rate": 9.955016560644847e-05, + "loss": 0.0572, + "step": 82 + }, + { + "epoch": 0.15, + "grad_norm": 0.1820615977048874, + "learning_rate": 9.953760286169571e-05, + "loss": 0.0595, + "step": 83 + }, + { + "epoch": 0.15, + "grad_norm": 0.36385104060173035, + "learning_rate": 9.952486791628905e-05, + "loss": 0.0874, + "step": 84 + }, + { + "epoch": 0.15, + "grad_norm": 0.848340630531311, + "learning_rate": 9.95119608144972e-05, + "loss": 0.1178, + "step": 85 + }, + { + "epoch": 0.15, + "grad_norm": 0.7947489023208618, + "learning_rate": 9.94988816011873e-05, + "loss": 0.1115, + "step": 86 + }, + { + "epoch": 0.15, + "grad_norm": 0.33932074904441833, + "learning_rate": 9.94856303218248e-05, + "loss": 0.0546, + "step": 87 + }, + { + "epoch": 0.16, + "grad_norm": 0.26873940229415894, + "learning_rate": 9.947220702247329e-05, + "loss": 0.0873, + "step": 88 + }, + { + "epoch": 0.16, + "grad_norm": 0.3373044431209564, + "learning_rate": 9.945861174979429e-05, + "loss": 0.1051, + "step": 89 + }, + { + "epoch": 0.16, + "grad_norm": 0.24391719698905945, + "learning_rate": 9.944484455104717e-05, + "loss": 0.0986, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 0.4301680326461792, + "learning_rate": 9.943090547408888e-05, + "loss": 0.1524, + "step": 91 + }, + { + "epoch": 0.16, + "grad_norm": 0.4246341288089752, + "learning_rate": 9.941679456737394e-05, + "loss": 0.1619, + "step": 92 + }, + { + "epoch": 0.16, + "grad_norm": 0.2680893838405609, + "learning_rate": 9.940251187995411e-05, + "loss": 0.1187, + "step": 93 + }, + { + "epoch": 0.17, + "grad_norm": 0.18920297920703888, + "learning_rate": 9.938805746147827e-05, + "loss": 0.105, + "step": 94 + }, + { + "epoch": 0.17, + "grad_norm": 0.22168701887130737, + "learning_rate": 9.937343136219233e-05, + "loss": 0.0856, + "step": 95 + }, + { + "epoch": 0.17, + "grad_norm": 0.2235199213027954, + "learning_rate": 9.935863363293896e-05, + "loss": 0.1026, + "step": 96 + }, + { + "epoch": 0.17, + "grad_norm": 0.16578496992588043, + "learning_rate": 9.93436643251574e-05, + "loss": 0.0777, + "step": 97 + }, + { + "epoch": 0.17, + "grad_norm": 0.15994016826152802, + "learning_rate": 9.932852349088342e-05, + "loss": 0.0957, + "step": 98 + }, + { + "epoch": 0.18, + "grad_norm": 0.18692170083522797, + "learning_rate": 9.931321118274897e-05, + "loss": 0.0913, + "step": 99 + }, + { + "epoch": 0.18, + "grad_norm": 0.15477485954761505, + "learning_rate": 9.929772745398206e-05, + "loss": 0.0911, + "step": 100 + }, + { + "epoch": 0.18, + "grad_norm": 0.28473320603370667, + "learning_rate": 9.928207235840664e-05, + "loss": 0.1283, + "step": 101 + }, + { + "epoch": 0.18, + "grad_norm": 0.38557159900665283, + "learning_rate": 9.926624595044234e-05, + "loss": 0.1125, + "step": 102 + }, + { + "epoch": 0.18, + "grad_norm": 0.13523289561271667, + "learning_rate": 9.925024828510427e-05, + "loss": 0.0555, + "step": 103 + }, + { + "epoch": 0.18, + "grad_norm": 0.15305563807487488, + "learning_rate": 9.923407941800291e-05, + "loss": 0.1003, + "step": 104 + }, + { + "epoch": 0.19, + "grad_norm": 0.21130621433258057, + "learning_rate": 9.921773940534382e-05, + "loss": 0.0945, + "step": 105 + }, + { + "epoch": 0.19, + "grad_norm": 0.2301904559135437, + "learning_rate": 9.920122830392748e-05, + "loss": 0.1019, + "step": 106 + }, + { + "epoch": 0.19, + "grad_norm": 0.16425654292106628, + "learning_rate": 9.918454617114918e-05, + "loss": 0.0781, + "step": 107 + }, + { + "epoch": 0.19, + "grad_norm": 0.2672991454601288, + "learning_rate": 9.916769306499866e-05, + "loss": 0.085, + "step": 108 + }, + { + "epoch": 0.19, + "grad_norm": 0.3746218681335449, + "learning_rate": 9.915066904406e-05, + "loss": 0.1698, + "step": 109 + }, + { + "epoch": 0.19, + "grad_norm": 0.1691233068704605, + "learning_rate": 9.913347416751148e-05, + "loss": 0.046, + "step": 110 + }, + { + "epoch": 0.2, + "grad_norm": 0.3089153468608856, + "learning_rate": 9.91161084951252e-05, + "loss": 0.131, + "step": 111 + }, + { + "epoch": 0.2, + "grad_norm": 0.1581045240163803, + "learning_rate": 9.909857208726705e-05, + "loss": 0.0654, + "step": 112 + }, + { + "epoch": 0.2, + "grad_norm": 0.2545772194862366, + "learning_rate": 9.908086500489637e-05, + "loss": 0.1021, + "step": 113 + }, + { + "epoch": 0.2, + "grad_norm": 0.2257249355316162, + "learning_rate": 9.906298730956586e-05, + "loss": 0.0636, + "step": 114 + }, + { + "epoch": 0.2, + "grad_norm": 0.17862719297409058, + "learning_rate": 9.904493906342123e-05, + "loss": 0.0942, + "step": 115 + }, + { + "epoch": 0.21, + "grad_norm": 0.23423053324222565, + "learning_rate": 9.902672032920106e-05, + "loss": 0.0676, + "step": 116 + }, + { + "epoch": 0.21, + "grad_norm": 0.26653358340263367, + "learning_rate": 9.900833117023664e-05, + "loss": 0.0918, + "step": 117 + }, + { + "epoch": 0.21, + "grad_norm": 0.4517073631286621, + "learning_rate": 9.89897716504516e-05, + "loss": 0.1102, + "step": 118 + }, + { + "epoch": 0.21, + "grad_norm": 0.20187437534332275, + "learning_rate": 9.897104183436183e-05, + "loss": 0.0713, + "step": 119 + }, + { + "epoch": 0.21, + "grad_norm": 0.5759711861610413, + "learning_rate": 9.895214178707516e-05, + "loss": 0.0837, + "step": 120 + }, + { + "epoch": 0.21, + "grad_norm": 0.43704915046691895, + "learning_rate": 9.89330715742912e-05, + "loss": 0.0868, + "step": 121 + }, + { + "epoch": 0.22, + "grad_norm": 0.30784374475479126, + "learning_rate": 9.891383126230104e-05, + "loss": 0.1171, + "step": 122 + }, + { + "epoch": 0.22, + "grad_norm": 0.23538921773433685, + "learning_rate": 9.889442091798712e-05, + "loss": 0.055, + "step": 123 + }, + { + "epoch": 0.22, + "grad_norm": 0.27727362513542175, + "learning_rate": 9.887484060882291e-05, + "loss": 0.041, + "step": 124 + }, + { + "epoch": 0.22, + "grad_norm": 0.21666617691516876, + "learning_rate": 9.885509040287268e-05, + "loss": 0.0624, + "step": 125 + }, + { + "epoch": 0.22, + "grad_norm": 0.2829636335372925, + "learning_rate": 9.883517036879132e-05, + "loss": 0.0946, + "step": 126 + }, + { + "epoch": 0.22, + "grad_norm": 0.34035512804985046, + "learning_rate": 9.88150805758241e-05, + "loss": 0.0635, + "step": 127 + }, + { + "epoch": 0.23, + "grad_norm": 0.44064444303512573, + "learning_rate": 9.879482109380634e-05, + "loss": 0.0931, + "step": 128 + }, + { + "epoch": 0.23, + "grad_norm": 0.5164741277694702, + "learning_rate": 9.877439199316323e-05, + "loss": 0.0891, + "step": 129 + }, + { + "epoch": 0.23, + "grad_norm": 0.5549228191375732, + "learning_rate": 9.875379334490962e-05, + "loss": 0.1144, + "step": 130 + }, + { + "epoch": 0.23, + "grad_norm": 0.41133901476860046, + "learning_rate": 9.873302522064972e-05, + "loss": 0.1022, + "step": 131 + }, + { + "epoch": 0.23, + "grad_norm": 0.5204330682754517, + "learning_rate": 9.871208769257685e-05, + "loss": 0.0867, + "step": 132 + }, + { + "epoch": 0.24, + "grad_norm": 0.3383274972438812, + "learning_rate": 9.869098083347323e-05, + "loss": 0.0558, + "step": 133 + }, + { + "epoch": 0.24, + "grad_norm": 0.7084139585494995, + "learning_rate": 9.866970471670967e-05, + "loss": 0.1208, + "step": 134 + }, + { + "epoch": 0.24, + "grad_norm": 0.2974587380886078, + "learning_rate": 9.864825941624537e-05, + "loss": 0.1199, + "step": 135 + }, + { + "epoch": 0.24, + "grad_norm": 0.1811504065990448, + "learning_rate": 9.862664500662764e-05, + "loss": 0.1025, + "step": 136 + }, + { + "epoch": 0.24, + "grad_norm": 0.23748300969600677, + "learning_rate": 9.860486156299164e-05, + "loss": 0.0864, + "step": 137 + }, + { + "epoch": 0.24, + "grad_norm": 0.21784676611423492, + "learning_rate": 9.85829091610601e-05, + "loss": 0.095, + "step": 138 + }, + { + "epoch": 0.25, + "grad_norm": 0.26979225873947144, + "learning_rate": 9.856078787714309e-05, + "loss": 0.0864, + "step": 139 + }, + { + "epoch": 0.25, + "grad_norm": 0.1479984074831009, + "learning_rate": 9.853849778813777e-05, + "loss": 0.0904, + "step": 140 + }, + { + "epoch": 0.25, + "grad_norm": 0.17924343049526215, + "learning_rate": 9.851603897152803e-05, + "loss": 0.0752, + "step": 141 + }, + { + "epoch": 0.25, + "grad_norm": 0.16448016464710236, + "learning_rate": 9.849341150538434e-05, + "loss": 0.0781, + "step": 142 + }, + { + "epoch": 0.25, + "eval_loss": 0.0848281979560852, + "eval_runtime": 14.6961, + "eval_samples_per_second": 32.458, + "eval_steps_per_second": 8.165, + "step": 142 + }, + { + "epoch": 0.25, + "grad_norm": 0.14405055344104767, + "learning_rate": 9.847061546836339e-05, + "loss": 0.1007, + "step": 143 + }, + { + "epoch": 0.25, + "grad_norm": 0.20907168090343475, + "learning_rate": 9.844765093970787e-05, + "loss": 0.1126, + "step": 144 + }, + { + "epoch": 0.26, + "grad_norm": 0.1777975857257843, + "learning_rate": 9.842451799924616e-05, + "loss": 0.0928, + "step": 145 + }, + { + "epoch": 0.26, + "grad_norm": 0.1817995309829712, + "learning_rate": 9.840121672739208e-05, + "loss": 0.046, + "step": 146 + }, + { + "epoch": 0.26, + "grad_norm": 0.2099136859178543, + "learning_rate": 9.837774720514457e-05, + "loss": 0.1032, + "step": 147 + }, + { + "epoch": 0.26, + "grad_norm": 0.19467169046401978, + "learning_rate": 9.835410951408748e-05, + "loss": 0.0913, + "step": 148 + }, + { + "epoch": 0.26, + "grad_norm": 0.24700500071048737, + "learning_rate": 9.833030373638919e-05, + "loss": 0.1101, + "step": 149 + }, + { + "epoch": 0.27, + "grad_norm": 0.22854459285736084, + "learning_rate": 9.830632995480242e-05, + "loss": 0.0729, + "step": 150 + }, + { + "epoch": 0.27, + "grad_norm": 0.206742063164711, + "learning_rate": 9.828218825266388e-05, + "loss": 0.0861, + "step": 151 + }, + { + "epoch": 0.27, + "grad_norm": 0.40378740429878235, + "learning_rate": 9.8257878713894e-05, + "loss": 0.0948, + "step": 152 + }, + { + "epoch": 0.27, + "grad_norm": 0.5055291652679443, + "learning_rate": 9.823340142299662e-05, + "loss": 0.193, + "step": 153 + }, + { + "epoch": 0.27, + "grad_norm": 0.3036790192127228, + "learning_rate": 9.820875646505874e-05, + "loss": 0.0859, + "step": 154 + }, + { + "epoch": 0.27, + "grad_norm": 0.1878231018781662, + "learning_rate": 9.818394392575019e-05, + "loss": 0.0702, + "step": 155 + }, + { + "epoch": 0.28, + "grad_norm": 0.17990007996559143, + "learning_rate": 9.815896389132333e-05, + "loss": 0.0967, + "step": 156 + }, + { + "epoch": 0.28, + "grad_norm": 0.22680750489234924, + "learning_rate": 9.813381644861277e-05, + "loss": 0.0959, + "step": 157 + }, + { + "epoch": 0.28, + "grad_norm": 0.299663782119751, + "learning_rate": 9.810850168503506e-05, + "loss": 0.0801, + "step": 158 + }, + { + "epoch": 0.28, + "grad_norm": 0.3132835924625397, + "learning_rate": 9.808301968858837e-05, + "loss": 0.1151, + "step": 159 + }, + { + "epoch": 0.28, + "grad_norm": 0.16891297698020935, + "learning_rate": 9.805737054785222e-05, + "loss": 0.0799, + "step": 160 + }, + { + "epoch": 0.28, + "grad_norm": 0.23542983829975128, + "learning_rate": 9.803155435198712e-05, + "loss": 0.0645, + "step": 161 + }, + { + "epoch": 0.29, + "grad_norm": 0.1784803569316864, + "learning_rate": 9.800557119073433e-05, + "loss": 0.0475, + "step": 162 + }, + { + "epoch": 0.29, + "grad_norm": 0.37001606822013855, + "learning_rate": 9.797942115441545e-05, + "loss": 0.1331, + "step": 163 + }, + { + "epoch": 0.29, + "grad_norm": 0.18926851451396942, + "learning_rate": 9.795310433393226e-05, + "loss": 0.0744, + "step": 164 + }, + { + "epoch": 0.29, + "grad_norm": 0.15572589635849, + "learning_rate": 9.792662082076618e-05, + "loss": 0.0551, + "step": 165 + }, + { + "epoch": 0.29, + "grad_norm": 0.2562514841556549, + "learning_rate": 9.789997070697821e-05, + "loss": 0.106, + "step": 166 + }, + { + "epoch": 0.3, + "grad_norm": 0.4756919741630554, + "learning_rate": 9.787315408520838e-05, + "loss": 0.1229, + "step": 167 + }, + { + "epoch": 0.3, + "grad_norm": 0.2322833091020584, + "learning_rate": 9.78461710486756e-05, + "loss": 0.1212, + "step": 168 + }, + { + "epoch": 0.3, + "grad_norm": 0.28180065751075745, + "learning_rate": 9.78190216911772e-05, + "loss": 0.0855, + "step": 169 + }, + { + "epoch": 0.3, + "grad_norm": 0.2058788239955902, + "learning_rate": 9.779170610708872e-05, + "loss": 0.0445, + "step": 170 + }, + { + "epoch": 0.3, + "grad_norm": 0.16885802149772644, + "learning_rate": 9.776422439136352e-05, + "loss": 0.0619, + "step": 171 + }, + { + "epoch": 0.3, + "grad_norm": 0.2562006413936615, + "learning_rate": 9.773657663953243e-05, + "loss": 0.1111, + "step": 172 + }, + { + "epoch": 0.31, + "grad_norm": 0.2394249439239502, + "learning_rate": 9.770876294770349e-05, + "loss": 0.0562, + "step": 173 + }, + { + "epoch": 0.31, + "grad_norm": 0.214800164103508, + "learning_rate": 9.768078341256155e-05, + "loss": 0.0428, + "step": 174 + }, + { + "epoch": 0.31, + "grad_norm": 0.30056923627853394, + "learning_rate": 9.765263813136796e-05, + "loss": 0.1173, + "step": 175 + }, + { + "epoch": 0.31, + "grad_norm": 0.3174525499343872, + "learning_rate": 9.762432720196024e-05, + "loss": 0.0871, + "step": 176 + }, + { + "epoch": 0.31, + "grad_norm": 0.31560906767845154, + "learning_rate": 9.75958507227517e-05, + "loss": 0.1133, + "step": 177 + }, + { + "epoch": 0.31, + "grad_norm": 0.21752357482910156, + "learning_rate": 9.756720879273117e-05, + "loss": 0.0421, + "step": 178 + }, + { + "epoch": 0.32, + "grad_norm": 0.3215218484401703, + "learning_rate": 9.753840151146259e-05, + "loss": 0.0596, + "step": 179 + }, + { + "epoch": 0.32, + "grad_norm": 0.3161137104034424, + "learning_rate": 9.750942897908468e-05, + "loss": 0.122, + "step": 180 + }, + { + "epoch": 0.32, + "grad_norm": 0.4206744134426117, + "learning_rate": 9.748029129631062e-05, + "loss": 0.0966, + "step": 181 + }, + { + "epoch": 0.32, + "grad_norm": 0.28242579102516174, + "learning_rate": 9.745098856442768e-05, + "loss": 0.0853, + "step": 182 + }, + { + "epoch": 0.32, + "grad_norm": 0.24647079408168793, + "learning_rate": 9.742152088529684e-05, + "loss": 0.1077, + "step": 183 + }, + { + "epoch": 0.33, + "grad_norm": 0.29940977692604065, + "learning_rate": 9.739188836135247e-05, + "loss": 0.0837, + "step": 184 + }, + { + "epoch": 0.33, + "grad_norm": 0.21811984479427338, + "learning_rate": 9.7362091095602e-05, + "loss": 0.1, + "step": 185 + }, + { + "epoch": 0.33, + "grad_norm": 0.24434742331504822, + "learning_rate": 9.733212919162549e-05, + "loss": 0.0839, + "step": 186 + }, + { + "epoch": 0.33, + "grad_norm": 0.1656690537929535, + "learning_rate": 9.730200275357535e-05, + "loss": 0.0894, + "step": 187 + }, + { + "epoch": 0.33, + "grad_norm": 0.16984042525291443, + "learning_rate": 9.727171188617587e-05, + "loss": 0.0732, + "step": 188 + }, + { + "epoch": 0.33, + "grad_norm": 0.19889003038406372, + "learning_rate": 9.7241256694723e-05, + "loss": 0.0832, + "step": 189 + }, + { + "epoch": 0.34, + "grad_norm": 0.1639273464679718, + "learning_rate": 9.721063728508383e-05, + "loss": 0.0912, + "step": 190 + }, + { + "epoch": 0.34, + "grad_norm": 0.26211172342300415, + "learning_rate": 9.717985376369639e-05, + "loss": 0.0986, + "step": 191 + }, + { + "epoch": 0.34, + "grad_norm": 0.12403538823127747, + "learning_rate": 9.714890623756912e-05, + "loss": 0.0844, + "step": 192 + }, + { + "epoch": 0.34, + "grad_norm": 0.1882586032152176, + "learning_rate": 9.711779481428057e-05, + "loss": 0.1163, + "step": 193 + }, + { + "epoch": 0.34, + "grad_norm": 0.1780715435743332, + "learning_rate": 9.708651960197904e-05, + "loss": 0.1038, + "step": 194 + }, + { + "epoch": 0.34, + "grad_norm": 0.1291002333164215, + "learning_rate": 9.705508070938218e-05, + "loss": 0.0746, + "step": 195 + }, + { + "epoch": 0.35, + "grad_norm": 0.2384466975927353, + "learning_rate": 9.702347824577666e-05, + "loss": 0.0909, + "step": 196 + }, + { + "epoch": 0.35, + "grad_norm": 0.25463247299194336, + "learning_rate": 9.699171232101768e-05, + "loss": 0.0977, + "step": 197 + }, + { + "epoch": 0.35, + "grad_norm": 0.19303986430168152, + "learning_rate": 9.69597830455287e-05, + "loss": 0.1137, + "step": 198 + }, + { + "epoch": 0.35, + "grad_norm": 0.21899022161960602, + "learning_rate": 9.692769053030099e-05, + "loss": 0.0671, + "step": 199 + }, + { + "epoch": 0.35, + "grad_norm": 0.16923530399799347, + "learning_rate": 9.689543488689332e-05, + "loss": 0.0776, + "step": 200 + }, + { + "epoch": 0.36, + "grad_norm": 0.260955810546875, + "learning_rate": 9.686301622743144e-05, + "loss": 0.092, + "step": 201 + }, + { + "epoch": 0.36, + "grad_norm": 0.1771455854177475, + "learning_rate": 9.683043466460782e-05, + "loss": 0.06, + "step": 202 + }, + { + "epoch": 0.36, + "grad_norm": 0.18851810693740845, + "learning_rate": 9.67976903116812e-05, + "loss": 0.0965, + "step": 203 + }, + { + "epoch": 0.36, + "grad_norm": 0.589522659778595, + "learning_rate": 9.676478328247622e-05, + "loss": 0.1673, + "step": 204 + }, + { + "epoch": 0.36, + "grad_norm": 0.21747058629989624, + "learning_rate": 9.673171369138296e-05, + "loss": 0.0997, + "step": 205 + }, + { + "epoch": 0.36, + "grad_norm": 0.4168107807636261, + "learning_rate": 9.669848165335666e-05, + "loss": 0.0795, + "step": 206 + }, + { + "epoch": 0.37, + "grad_norm": 0.6520416140556335, + "learning_rate": 9.666508728391719e-05, + "loss": 0.1177, + "step": 207 + }, + { + "epoch": 0.37, + "grad_norm": 0.3752453029155731, + "learning_rate": 9.663153069914875e-05, + "loss": 0.0871, + "step": 208 + }, + { + "epoch": 0.37, + "grad_norm": 0.26546868681907654, + "learning_rate": 9.65978120156994e-05, + "loss": 0.0647, + "step": 209 + }, + { + "epoch": 0.37, + "grad_norm": 0.20044176280498505, + "learning_rate": 9.656393135078068e-05, + "loss": 0.1072, + "step": 210 + }, + { + "epoch": 0.37, + "grad_norm": 0.25033503770828247, + "learning_rate": 9.652988882216724e-05, + "loss": 0.1326, + "step": 211 + }, + { + "epoch": 0.38, + "grad_norm": 0.24569682776927948, + "learning_rate": 9.649568454819637e-05, + "loss": 0.0931, + "step": 212 + }, + { + "epoch": 0.38, + "grad_norm": 0.5623157620429993, + "learning_rate": 9.64613186477676e-05, + "loss": 0.2157, + "step": 213 + }, + { + "epoch": 0.38, + "grad_norm": 0.15052182972431183, + "learning_rate": 9.642679124034233e-05, + "loss": 0.1236, + "step": 214 + }, + { + "epoch": 0.38, + "grad_norm": 0.14209671318531036, + "learning_rate": 9.639210244594334e-05, + "loss": 0.0971, + "step": 215 + }, + { + "epoch": 0.38, + "grad_norm": 0.1627768725156784, + "learning_rate": 9.635725238515445e-05, + "loss": 0.1161, + "step": 216 + }, + { + "epoch": 0.38, + "grad_norm": 0.10190293937921524, + "learning_rate": 9.63222411791201e-05, + "loss": 0.0999, + "step": 217 + }, + { + "epoch": 0.39, + "grad_norm": 0.11575043201446533, + "learning_rate": 9.62870689495448e-05, + "loss": 0.0986, + "step": 218 + }, + { + "epoch": 0.39, + "grad_norm": 0.09842410683631897, + "learning_rate": 9.62517358186929e-05, + "loss": 0.1176, + "step": 219 + }, + { + "epoch": 0.39, + "grad_norm": 0.14816004037857056, + "learning_rate": 9.621624190938803e-05, + "loss": 0.0833, + "step": 220 + }, + { + "epoch": 0.39, + "grad_norm": 0.11311839520931244, + "learning_rate": 9.618058734501269e-05, + "loss": 0.0815, + "step": 221 + }, + { + "epoch": 0.39, + "grad_norm": 0.17481163144111633, + "learning_rate": 9.614477224950789e-05, + "loss": 0.0678, + "step": 222 + }, + { + "epoch": 0.39, + "grad_norm": 0.2277013659477234, + "learning_rate": 9.610879674737264e-05, + "loss": 0.0941, + "step": 223 + }, + { + "epoch": 0.4, + "grad_norm": 0.14689870178699493, + "learning_rate": 9.607266096366352e-05, + "loss": 0.0991, + "step": 224 + }, + { + "epoch": 0.4, + "grad_norm": 0.24558769166469574, + "learning_rate": 9.603636502399436e-05, + "loss": 0.0878, + "step": 225 + }, + { + "epoch": 0.4, + "grad_norm": 0.1541660875082016, + "learning_rate": 9.599990905453567e-05, + "loss": 0.0784, + "step": 226 + }, + { + "epoch": 0.4, + "grad_norm": 0.12188339233398438, + "learning_rate": 9.59632931820142e-05, + "loss": 0.0464, + "step": 227 + }, + { + "epoch": 0.4, + "grad_norm": 0.32710394263267517, + "learning_rate": 9.592651753371265e-05, + "loss": 0.0541, + "step": 228 + }, + { + "epoch": 0.41, + "grad_norm": 0.3118465840816498, + "learning_rate": 9.588958223746903e-05, + "loss": 0.0845, + "step": 229 + }, + { + "epoch": 0.41, + "grad_norm": 0.26805219054222107, + "learning_rate": 9.585248742167639e-05, + "loss": 0.0485, + "step": 230 + }, + { + "epoch": 0.41, + "grad_norm": 0.7972936630249023, + "learning_rate": 9.581523321528223e-05, + "loss": 0.1013, + "step": 231 + }, + { + "epoch": 0.41, + "grad_norm": 0.6285438537597656, + "learning_rate": 9.577781974778817e-05, + "loss": 0.0767, + "step": 232 + }, + { + "epoch": 0.41, + "grad_norm": 0.6384493708610535, + "learning_rate": 9.57402471492494e-05, + "loss": 0.1855, + "step": 233 + }, + { + "epoch": 0.41, + "grad_norm": 0.5759001970291138, + "learning_rate": 9.570251555027432e-05, + "loss": 0.1585, + "step": 234 + }, + { + "epoch": 0.42, + "grad_norm": 0.42002353072166443, + "learning_rate": 9.566462508202402e-05, + "loss": 0.1479, + "step": 235 + }, + { + "epoch": 0.42, + "grad_norm": 0.18405884504318237, + "learning_rate": 9.562657587621184e-05, + "loss": 0.09, + "step": 236 + }, + { + "epoch": 0.42, + "grad_norm": 0.20893922448158264, + "learning_rate": 9.558836806510291e-05, + "loss": 0.0685, + "step": 237 + }, + { + "epoch": 0.42, + "grad_norm": 0.31388092041015625, + "learning_rate": 9.555000178151374e-05, + "loss": 0.0983, + "step": 238 + }, + { + "epoch": 0.42, + "grad_norm": 0.20344533026218414, + "learning_rate": 9.551147715881166e-05, + "loss": 0.0944, + "step": 239 + }, + { + "epoch": 0.42, + "grad_norm": 0.1582648903131485, + "learning_rate": 9.547279433091446e-05, + "loss": 0.0662, + "step": 240 + }, + { + "epoch": 0.43, + "grad_norm": 0.16737405955791473, + "learning_rate": 9.543395343228983e-05, + "loss": 0.1565, + "step": 241 + }, + { + "epoch": 0.43, + "grad_norm": 0.21974924206733704, + "learning_rate": 9.539495459795499e-05, + "loss": 0.1243, + "step": 242 + }, + { + "epoch": 0.43, + "grad_norm": 0.1147058829665184, + "learning_rate": 9.535579796347612e-05, + "loss": 0.0727, + "step": 243 + }, + { + "epoch": 0.43, + "grad_norm": 0.13460345566272736, + "learning_rate": 9.531648366496799e-05, + "loss": 0.0691, + "step": 244 + }, + { + "epoch": 0.43, + "grad_norm": 0.1404263824224472, + "learning_rate": 9.527701183909336e-05, + "loss": 0.0975, + "step": 245 + }, + { + "epoch": 0.44, + "grad_norm": 0.17380090057849884, + "learning_rate": 9.523738262306269e-05, + "loss": 0.0873, + "step": 246 + }, + { + "epoch": 0.44, + "grad_norm": 0.13862797617912292, + "learning_rate": 9.519759615463346e-05, + "loss": 0.0738, + "step": 247 + }, + { + "epoch": 0.44, + "grad_norm": 0.17551685869693756, + "learning_rate": 9.51576525721098e-05, + "loss": 0.0676, + "step": 248 + }, + { + "epoch": 0.44, + "grad_norm": 0.20715269446372986, + "learning_rate": 9.511755201434205e-05, + "loss": 0.0737, + "step": 249 + }, + { + "epoch": 0.44, + "grad_norm": 0.14763356745243073, + "learning_rate": 9.507729462072614e-05, + "loss": 0.07, + "step": 250 + }, + { + "epoch": 0.44, + "grad_norm": 0.317452073097229, + "learning_rate": 9.503688053120327e-05, + "loss": 0.1252, + "step": 251 + }, + { + "epoch": 0.45, + "grad_norm": 0.21908459067344666, + "learning_rate": 9.499630988625925e-05, + "loss": 0.0877, + "step": 252 + }, + { + "epoch": 0.45, + "grad_norm": 0.3233601450920105, + "learning_rate": 9.49555828269242e-05, + "loss": 0.0891, + "step": 253 + }, + { + "epoch": 0.45, + "grad_norm": 0.4098372161388397, + "learning_rate": 9.491469949477187e-05, + "loss": 0.0805, + "step": 254 + }, + { + "epoch": 0.45, + "grad_norm": 0.40573808550834656, + "learning_rate": 9.487366003191931e-05, + "loss": 0.1284, + "step": 255 + }, + { + "epoch": 0.45, + "grad_norm": 0.3391616940498352, + "learning_rate": 9.483246458102625e-05, + "loss": 0.0901, + "step": 256 + }, + { + "epoch": 0.45, + "grad_norm": 0.1822938323020935, + "learning_rate": 9.479111328529473e-05, + "loss": 0.0398, + "step": 257 + }, + { + "epoch": 0.46, + "grad_norm": 0.4700302183628082, + "learning_rate": 9.474960628846843e-05, + "loss": 0.1509, + "step": 258 + }, + { + "epoch": 0.46, + "grad_norm": 0.20210890471935272, + "learning_rate": 9.470794373483236e-05, + "loss": 0.0765, + "step": 259 + }, + { + "epoch": 0.46, + "grad_norm": 0.28329914808273315, + "learning_rate": 9.466612576921223e-05, + "loss": 0.0666, + "step": 260 + }, + { + "epoch": 0.46, + "grad_norm": 0.41083166003227234, + "learning_rate": 9.462415253697401e-05, + "loss": 0.1248, + "step": 261 + }, + { + "epoch": 0.46, + "grad_norm": 0.17644570767879486, + "learning_rate": 9.458202418402338e-05, + "loss": 0.0532, + "step": 262 + }, + { + "epoch": 0.47, + "grad_norm": 0.2667219936847687, + "learning_rate": 9.453974085680526e-05, + "loss": 0.0937, + "step": 263 + }, + { + "epoch": 0.47, + "grad_norm": 0.20900332927703857, + "learning_rate": 9.449730270230326e-05, + "loss": 0.0853, + "step": 264 + }, + { + "epoch": 0.47, + "grad_norm": 0.26425743103027344, + "learning_rate": 9.445470986803922e-05, + "loss": 0.12, + "step": 265 + }, + { + "epoch": 0.47, + "grad_norm": 0.1956167221069336, + "learning_rate": 9.441196250207267e-05, + "loss": 0.0965, + "step": 266 + }, + { + "epoch": 0.47, + "grad_norm": 0.21896903216838837, + "learning_rate": 9.436906075300032e-05, + "loss": 0.0867, + "step": 267 + }, + { + "epoch": 0.47, + "grad_norm": 0.2082919031381607, + "learning_rate": 9.432600476995551e-05, + "loss": 0.0847, + "step": 268 + }, + { + "epoch": 0.48, + "grad_norm": 0.1674569696187973, + "learning_rate": 9.428279470260776e-05, + "loss": 0.0846, + "step": 269 + }, + { + "epoch": 0.48, + "grad_norm": 0.23109744489192963, + "learning_rate": 9.423943070116218e-05, + "loss": 0.136, + "step": 270 + }, + { + "epoch": 0.48, + "grad_norm": 0.21344415843486786, + "learning_rate": 9.4195912916359e-05, + "loss": 0.1091, + "step": 271 + }, + { + "epoch": 0.48, + "grad_norm": 0.16391590237617493, + "learning_rate": 9.415224149947306e-05, + "loss": 0.0901, + "step": 272 + }, + { + "epoch": 0.48, + "grad_norm": 0.2023243010044098, + "learning_rate": 9.410841660231315e-05, + "loss": 0.0635, + "step": 273 + }, + { + "epoch": 0.48, + "grad_norm": 0.1723608821630478, + "learning_rate": 9.406443837722168e-05, + "loss": 0.1001, + "step": 274 + }, + { + "epoch": 0.49, + "grad_norm": 0.1470147669315338, + "learning_rate": 9.402030697707398e-05, + "loss": 0.0721, + "step": 275 + }, + { + "epoch": 0.49, + "grad_norm": 0.15082985162734985, + "learning_rate": 9.397602255527791e-05, + "loss": 0.0698, + "step": 276 + }, + { + "epoch": 0.49, + "grad_norm": 0.16322006285190582, + "learning_rate": 9.393158526577323e-05, + "loss": 0.0809, + "step": 277 + }, + { + "epoch": 0.49, + "grad_norm": 0.10098633915185928, + "learning_rate": 9.388699526303105e-05, + "loss": 0.0386, + "step": 278 + }, + { + "epoch": 0.49, + "grad_norm": 0.19049708545207977, + "learning_rate": 9.38422527020534e-05, + "loss": 0.0559, + "step": 279 + }, + { + "epoch": 0.5, + "grad_norm": 0.22742775082588196, + "learning_rate": 9.37973577383726e-05, + "loss": 0.0802, + "step": 280 + }, + { + "epoch": 0.5, + "grad_norm": 0.2055177539587021, + "learning_rate": 9.375231052805072e-05, + "loss": 0.1048, + "step": 281 + }, + { + "epoch": 0.5, + "grad_norm": 0.1366245150566101, + "learning_rate": 9.370711122767913e-05, + "loss": 0.0204, + "step": 282 + }, + { + "epoch": 0.5, + "grad_norm": 0.3235447406768799, + "learning_rate": 9.36617599943778e-05, + "loss": 0.0974, + "step": 283 + }, + { + "epoch": 0.5, + "grad_norm": 0.09579204767942429, + "learning_rate": 9.361625698579493e-05, + "loss": 0.0151, + "step": 284 + }, + { + "epoch": 0.5, + "eval_loss": 0.07987037301063538, + "eval_runtime": 14.6437, + "eval_samples_per_second": 32.574, + "eval_steps_per_second": 8.195, + "step": 284 + }, + { + "epoch": 0.5, + "grad_norm": 0.22850771248340607, + "learning_rate": 9.357060236010625e-05, + "loss": 0.0458, + "step": 285 + }, + { + "epoch": 0.51, + "grad_norm": 0.4980478882789612, + "learning_rate": 9.352479627601457e-05, + "loss": 0.1306, + "step": 286 + }, + { + "epoch": 0.51, + "grad_norm": 0.18770304322242737, + "learning_rate": 9.347883889274923e-05, + "loss": 0.0218, + "step": 287 + }, + { + "epoch": 0.51, + "grad_norm": 0.6386083364486694, + "learning_rate": 9.34327303700654e-05, + "loss": 0.0912, + "step": 288 + }, + { + "epoch": 0.51, + "grad_norm": 0.4997164309024811, + "learning_rate": 9.338647086824372e-05, + "loss": 0.1083, + "step": 289 + }, + { + "epoch": 0.51, + "grad_norm": 0.31682559847831726, + "learning_rate": 9.334006054808966e-05, + "loss": 0.0947, + "step": 290 + }, + { + "epoch": 0.51, + "grad_norm": 0.28325051069259644, + "learning_rate": 9.329349957093292e-05, + "loss": 0.0794, + "step": 291 + }, + { + "epoch": 0.52, + "grad_norm": 0.5778185725212097, + "learning_rate": 9.324678809862695e-05, + "loss": 0.1223, + "step": 292 + }, + { + "epoch": 0.52, + "grad_norm": 0.2953624725341797, + "learning_rate": 9.319992629354828e-05, + "loss": 0.0747, + "step": 293 + }, + { + "epoch": 0.52, + "grad_norm": 0.28283196687698364, + "learning_rate": 9.31529143185961e-05, + "loss": 0.1099, + "step": 294 + }, + { + "epoch": 0.52, + "grad_norm": 0.36138102412223816, + "learning_rate": 9.310575233719154e-05, + "loss": 0.1303, + "step": 295 + }, + { + "epoch": 0.52, + "grad_norm": 0.16202205419540405, + "learning_rate": 9.305844051327725e-05, + "loss": 0.0805, + "step": 296 + }, + { + "epoch": 0.53, + "grad_norm": 0.11523901671171188, + "learning_rate": 9.30109790113167e-05, + "loss": 0.0775, + "step": 297 + }, + { + "epoch": 0.53, + "grad_norm": 0.10756238549947739, + "learning_rate": 9.296336799629369e-05, + "loss": 0.0795, + "step": 298 + }, + { + "epoch": 0.53, + "grad_norm": 0.18142195045948029, + "learning_rate": 9.291560763371173e-05, + "loss": 0.0833, + "step": 299 + }, + { + "epoch": 0.53, + "grad_norm": 0.14596430957317352, + "learning_rate": 9.28676980895935e-05, + "loss": 0.0904, + "step": 300 + }, + { + "epoch": 0.53, + "grad_norm": 0.10054739564657211, + "learning_rate": 9.28196395304803e-05, + "loss": 0.0898, + "step": 301 + }, + { + "epoch": 0.53, + "grad_norm": 0.24579764902591705, + "learning_rate": 9.277143212343134e-05, + "loss": 0.145, + "step": 302 + }, + { + "epoch": 0.54, + "grad_norm": 0.13506978750228882, + "learning_rate": 9.272307603602334e-05, + "loss": 0.0847, + "step": 303 + }, + { + "epoch": 0.54, + "grad_norm": 0.17480792105197906, + "learning_rate": 9.267457143634979e-05, + "loss": 0.125, + "step": 304 + }, + { + "epoch": 0.54, + "grad_norm": 0.31458401679992676, + "learning_rate": 9.262591849302048e-05, + "loss": 0.1047, + "step": 305 + }, + { + "epoch": 0.54, + "grad_norm": 0.17494355142116547, + "learning_rate": 9.257711737516082e-05, + "loss": 0.0576, + "step": 306 + }, + { + "epoch": 0.54, + "grad_norm": 0.2996468245983124, + "learning_rate": 9.252816825241134e-05, + "loss": 0.1012, + "step": 307 + }, + { + "epoch": 0.54, + "grad_norm": 0.19122976064682007, + "learning_rate": 9.247907129492707e-05, + "loss": 0.0878, + "step": 308 + }, + { + "epoch": 0.55, + "grad_norm": 0.16079925000667572, + "learning_rate": 9.242982667337685e-05, + "loss": 0.0778, + "step": 309 + }, + { + "epoch": 0.55, + "grad_norm": 0.2628028392791748, + "learning_rate": 9.238043455894293e-05, + "loss": 0.0987, + "step": 310 + }, + { + "epoch": 0.55, + "grad_norm": 0.318097859621048, + "learning_rate": 9.23308951233202e-05, + "loss": 0.1108, + "step": 311 + }, + { + "epoch": 0.55, + "grad_norm": 0.2207389920949936, + "learning_rate": 9.228120853871571e-05, + "loss": 0.0826, + "step": 312 + }, + { + "epoch": 0.55, + "grad_norm": 0.34375905990600586, + "learning_rate": 9.223137497784797e-05, + "loss": 0.1174, + "step": 313 + }, + { + "epoch": 0.56, + "grad_norm": 0.16714760661125183, + "learning_rate": 9.218139461394644e-05, + "loss": 0.0883, + "step": 314 + }, + { + "epoch": 0.56, + "grad_norm": 0.24213539063930511, + "learning_rate": 9.213126762075088e-05, + "loss": 0.0686, + "step": 315 + }, + { + "epoch": 0.56, + "grad_norm": 0.2654499113559723, + "learning_rate": 9.208099417251077e-05, + "loss": 0.1185, + "step": 316 + }, + { + "epoch": 0.56, + "grad_norm": 0.1353083997964859, + "learning_rate": 9.203057444398469e-05, + "loss": 0.0806, + "step": 317 + }, + { + "epoch": 0.56, + "grad_norm": 0.30304938554763794, + "learning_rate": 9.198000861043967e-05, + "loss": 0.0817, + "step": 318 + }, + { + "epoch": 0.56, + "grad_norm": 0.14495517313480377, + "learning_rate": 9.192929684765067e-05, + "loss": 0.0436, + "step": 319 + }, + { + "epoch": 0.57, + "grad_norm": 0.2180556207895279, + "learning_rate": 9.187843933189995e-05, + "loss": 0.1255, + "step": 320 + }, + { + "epoch": 0.57, + "grad_norm": 0.3051697611808777, + "learning_rate": 9.182743623997634e-05, + "loss": 0.1241, + "step": 321 + }, + { + "epoch": 0.57, + "grad_norm": 0.42936787009239197, + "learning_rate": 9.17762877491748e-05, + "loss": 0.1847, + "step": 322 + }, + { + "epoch": 0.57, + "grad_norm": 0.20895107090473175, + "learning_rate": 9.172499403729566e-05, + "loss": 0.0939, + "step": 323 + }, + { + "epoch": 0.57, + "grad_norm": 0.15273532271385193, + "learning_rate": 9.167355528264414e-05, + "loss": 0.1012, + "step": 324 + }, + { + "epoch": 0.57, + "grad_norm": 0.15428248047828674, + "learning_rate": 9.162197166402956e-05, + "loss": 0.061, + "step": 325 + }, + { + "epoch": 0.58, + "grad_norm": 0.13089029490947723, + "learning_rate": 9.157024336076487e-05, + "loss": 0.089, + "step": 326 + }, + { + "epoch": 0.58, + "grad_norm": 0.12000248581171036, + "learning_rate": 9.151837055266594e-05, + "loss": 0.0813, + "step": 327 + }, + { + "epoch": 0.58, + "grad_norm": 0.12965545058250427, + "learning_rate": 9.146635342005099e-05, + "loss": 0.113, + "step": 328 + }, + { + "epoch": 0.58, + "grad_norm": 0.12225235253572464, + "learning_rate": 9.14141921437399e-05, + "loss": 0.0968, + "step": 329 + }, + { + "epoch": 0.58, + "grad_norm": 0.11869696527719498, + "learning_rate": 9.136188690505363e-05, + "loss": 0.0752, + "step": 330 + }, + { + "epoch": 0.59, + "grad_norm": 0.22600843012332916, + "learning_rate": 9.130943788581359e-05, + "loss": 0.1049, + "step": 331 + }, + { + "epoch": 0.59, + "grad_norm": 0.13381795585155487, + "learning_rate": 9.125684526834099e-05, + "loss": 0.0917, + "step": 332 + }, + { + "epoch": 0.59, + "grad_norm": 0.12936879694461823, + "learning_rate": 9.120410923545619e-05, + "loss": 0.0782, + "step": 333 + }, + { + "epoch": 0.59, + "grad_norm": 0.14804388582706451, + "learning_rate": 9.115122997047811e-05, + "loss": 0.0959, + "step": 334 + }, + { + "epoch": 0.59, + "grad_norm": 0.18504676222801208, + "learning_rate": 9.109820765722357e-05, + "loss": 0.1126, + "step": 335 + }, + { + "epoch": 0.59, + "grad_norm": 0.1957363486289978, + "learning_rate": 9.10450424800066e-05, + "loss": 0.101, + "step": 336 + }, + { + "epoch": 0.6, + "grad_norm": 0.15677915513515472, + "learning_rate": 9.099173462363792e-05, + "loss": 0.0775, + "step": 337 + }, + { + "epoch": 0.6, + "grad_norm": 0.124906025826931, + "learning_rate": 9.093828427342418e-05, + "loss": 0.07, + "step": 338 + }, + { + "epoch": 0.6, + "grad_norm": 0.12451624125242233, + "learning_rate": 9.088469161516735e-05, + "loss": 0.0588, + "step": 339 + }, + { + "epoch": 0.6, + "grad_norm": 0.509678304195404, + "learning_rate": 9.083095683516414e-05, + "loss": 0.1563, + "step": 340 + }, + { + "epoch": 0.6, + "grad_norm": 0.2245551496744156, + "learning_rate": 9.077708012020524e-05, + "loss": 0.1029, + "step": 341 + }, + { + "epoch": 0.61, + "grad_norm": 0.2735763192176819, + "learning_rate": 9.072306165757476e-05, + "loss": 0.0958, + "step": 342 + }, + { + "epoch": 0.61, + "grad_norm": 0.2062731385231018, + "learning_rate": 9.066890163504955e-05, + "loss": 0.0638, + "step": 343 + }, + { + "epoch": 0.61, + "grad_norm": 0.1664024293422699, + "learning_rate": 9.061460024089853e-05, + "loss": 0.0555, + "step": 344 + }, + { + "epoch": 0.61, + "grad_norm": 0.15788845717906952, + "learning_rate": 9.056015766388205e-05, + "loss": 0.0509, + "step": 345 + }, + { + "epoch": 0.61, + "grad_norm": 0.185616135597229, + "learning_rate": 9.050557409325125e-05, + "loss": 0.1196, + "step": 346 + }, + { + "epoch": 0.61, + "grad_norm": 0.24650661647319794, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0723, + "step": 347 + }, + { + "epoch": 0.62, + "grad_norm": 0.23959776759147644, + "learning_rate": 9.039598473060113e-05, + "loss": 0.1139, + "step": 348 + }, + { + "epoch": 0.62, + "grad_norm": 0.24370582401752472, + "learning_rate": 9.034097931953201e-05, + "loss": 0.0559, + "step": 349 + }, + { + "epoch": 0.62, + "grad_norm": 0.11590461432933807, + "learning_rate": 9.028583367674765e-05, + "loss": 0.0285, + "step": 350 + }, + { + "epoch": 0.62, + "grad_norm": 0.21419131755828857, + "learning_rate": 9.023054799394316e-05, + "loss": 0.0686, + "step": 351 + }, + { + "epoch": 0.62, + "grad_norm": 0.2115790992975235, + "learning_rate": 9.017512246330042e-05, + "loss": 0.071, + "step": 352 + }, + { + "epoch": 0.62, + "grad_norm": 0.2025454044342041, + "learning_rate": 9.011955727748748e-05, + "loss": 0.0993, + "step": 353 + }, + { + "epoch": 0.63, + "grad_norm": 0.22743502259254456, + "learning_rate": 9.006385262965786e-05, + "loss": 0.0705, + "step": 354 + }, + { + "epoch": 0.63, + "grad_norm": 0.16963045299053192, + "learning_rate": 9.00080087134498e-05, + "loss": 0.0569, + "step": 355 + }, + { + "epoch": 0.63, + "grad_norm": 0.12319042533636093, + "learning_rate": 8.995202572298576e-05, + "loss": 0.0427, + "step": 356 + }, + { + "epoch": 0.63, + "grad_norm": 0.15424852073192596, + "learning_rate": 8.989590385287155e-05, + "loss": 0.0564, + "step": 357 + }, + { + "epoch": 0.63, + "grad_norm": 0.306594580411911, + "learning_rate": 8.983964329819583e-05, + "loss": 0.095, + "step": 358 + }, + { + "epoch": 0.64, + "grad_norm": 0.13876177370548248, + "learning_rate": 8.978324425452931e-05, + "loss": 0.0641, + "step": 359 + }, + { + "epoch": 0.64, + "grad_norm": 0.19561870396137238, + "learning_rate": 8.972670691792409e-05, + "loss": 0.0635, + "step": 360 + }, + { + "epoch": 0.64, + "grad_norm": 0.3458711504936218, + "learning_rate": 8.967003148491304e-05, + "loss": 0.1328, + "step": 361 + }, + { + "epoch": 0.64, + "grad_norm": 0.1129189059138298, + "learning_rate": 8.961321815250905e-05, + "loss": 0.0205, + "step": 362 + }, + { + "epoch": 0.64, + "grad_norm": 0.3680332601070404, + "learning_rate": 8.955626711820438e-05, + "loss": 0.1302, + "step": 363 + }, + { + "epoch": 0.64, + "grad_norm": 0.2695287764072418, + "learning_rate": 8.949917857996996e-05, + "loss": 0.0511, + "step": 364 + }, + { + "epoch": 0.65, + "grad_norm": 0.17332953214645386, + "learning_rate": 8.94419527362547e-05, + "loss": 0.0494, + "step": 365 + }, + { + "epoch": 0.65, + "grad_norm": 0.11610284447669983, + "learning_rate": 8.938458978598483e-05, + "loss": 0.0381, + "step": 366 + }, + { + "epoch": 0.65, + "grad_norm": 0.23595061898231506, + "learning_rate": 8.932708992856315e-05, + "loss": 0.0802, + "step": 367 + }, + { + "epoch": 0.65, + "grad_norm": 0.26876452565193176, + "learning_rate": 8.926945336386838e-05, + "loss": 0.0461, + "step": 368 + }, + { + "epoch": 0.65, + "grad_norm": 0.19504375755786896, + "learning_rate": 8.921168029225448e-05, + "loss": 0.0317, + "step": 369 + }, + { + "epoch": 0.65, + "grad_norm": 0.4416268467903137, + "learning_rate": 8.915377091454992e-05, + "loss": 0.0952, + "step": 370 + }, + { + "epoch": 0.66, + "grad_norm": 0.32519325613975525, + "learning_rate": 8.909572543205698e-05, + "loss": 0.1027, + "step": 371 + }, + { + "epoch": 0.66, + "grad_norm": 0.3939536511898041, + "learning_rate": 8.903754404655106e-05, + "loss": 0.1718, + "step": 372 + }, + { + "epoch": 0.66, + "grad_norm": 0.20514678955078125, + "learning_rate": 8.897922696027999e-05, + "loss": 0.06, + "step": 373 + }, + { + "epoch": 0.66, + "grad_norm": 0.3049127459526062, + "learning_rate": 8.892077437596332e-05, + "loss": 0.1014, + "step": 374 + }, + { + "epoch": 0.66, + "grad_norm": 0.19251297414302826, + "learning_rate": 8.88621864967916e-05, + "loss": 0.048, + "step": 375 + }, + { + "epoch": 0.67, + "grad_norm": 0.21226820349693298, + "learning_rate": 8.880346352642575e-05, + "loss": 0.0652, + "step": 376 + }, + { + "epoch": 0.67, + "grad_norm": 0.42633509635925293, + "learning_rate": 8.874460566899616e-05, + "loss": 0.1083, + "step": 377 + }, + { + "epoch": 0.67, + "grad_norm": 0.24073313176631927, + "learning_rate": 8.868561312910221e-05, + "loss": 0.0851, + "step": 378 + }, + { + "epoch": 0.67, + "grad_norm": 0.3047339618206024, + "learning_rate": 8.862648611181145e-05, + "loss": 0.086, + "step": 379 + }, + { + "epoch": 0.67, + "grad_norm": 0.2227114737033844, + "learning_rate": 8.856722482265886e-05, + "loss": 0.1196, + "step": 380 + }, + { + "epoch": 0.67, + "grad_norm": 0.1860799938440323, + "learning_rate": 8.850782946764619e-05, + "loss": 0.0779, + "step": 381 + }, + { + "epoch": 0.68, + "grad_norm": 0.2109043002128601, + "learning_rate": 8.844830025324122e-05, + "loss": 0.076, + "step": 382 + }, + { + "epoch": 0.68, + "grad_norm": 0.1998620331287384, + "learning_rate": 8.838863738637706e-05, + "loss": 0.1027, + "step": 383 + }, + { + "epoch": 0.68, + "grad_norm": 0.12607474625110626, + "learning_rate": 8.832884107445139e-05, + "loss": 0.0436, + "step": 384 + }, + { + "epoch": 0.68, + "grad_norm": 0.2890150845050812, + "learning_rate": 8.826891152532579e-05, + "loss": 0.0966, + "step": 385 + }, + { + "epoch": 0.68, + "grad_norm": 0.4496447443962097, + "learning_rate": 8.820884894732497e-05, + "loss": 0.1575, + "step": 386 + }, + { + "epoch": 0.68, + "grad_norm": 0.19411596655845642, + "learning_rate": 8.814865354923613e-05, + "loss": 0.1201, + "step": 387 + }, + { + "epoch": 0.69, + "grad_norm": 0.19513021409511566, + "learning_rate": 8.808832554030808e-05, + "loss": 0.0747, + "step": 388 + }, + { + "epoch": 0.69, + "grad_norm": 0.14038780331611633, + "learning_rate": 8.802786513025068e-05, + "loss": 0.0608, + "step": 389 + }, + { + "epoch": 0.69, + "grad_norm": 0.14907363057136536, + "learning_rate": 8.796727252923402e-05, + "loss": 0.0843, + "step": 390 + }, + { + "epoch": 0.69, + "grad_norm": 0.18512780964374542, + "learning_rate": 8.790654794788769e-05, + "loss": 0.0988, + "step": 391 + }, + { + "epoch": 0.69, + "grad_norm": 0.17880797386169434, + "learning_rate": 8.784569159730007e-05, + "loss": 0.079, + "step": 392 + }, + { + "epoch": 0.7, + "grad_norm": 0.16263402998447418, + "learning_rate": 8.778470368901762e-05, + "loss": 0.0704, + "step": 393 + }, + { + "epoch": 0.7, + "grad_norm": 0.27071598172187805, + "learning_rate": 8.772358443504405e-05, + "loss": 0.0983, + "step": 394 + }, + { + "epoch": 0.7, + "grad_norm": 0.23446398973464966, + "learning_rate": 8.766233404783974e-05, + "loss": 0.0577, + "step": 395 + }, + { + "epoch": 0.7, + "grad_norm": 0.2932927906513214, + "learning_rate": 8.760095274032083e-05, + "loss": 0.0946, + "step": 396 + }, + { + "epoch": 0.7, + "grad_norm": 0.29224956035614014, + "learning_rate": 8.75394407258586e-05, + "loss": 0.078, + "step": 397 + }, + { + "epoch": 0.7, + "grad_norm": 0.15467233955860138, + "learning_rate": 8.747779821827868e-05, + "loss": 0.0779, + "step": 398 + }, + { + "epoch": 0.71, + "grad_norm": 0.1883499026298523, + "learning_rate": 8.741602543186032e-05, + "loss": 0.0721, + "step": 399 + }, + { + "epoch": 0.71, + "grad_norm": 0.25179481506347656, + "learning_rate": 8.735412258133562e-05, + "loss": 0.0875, + "step": 400 + }, + { + "epoch": 0.71, + "grad_norm": 0.23851999640464783, + "learning_rate": 8.729208988188881e-05, + "loss": 0.0959, + "step": 401 + }, + { + "epoch": 0.71, + "grad_norm": 0.2662704885005951, + "learning_rate": 8.722992754915554e-05, + "loss": 0.1025, + "step": 402 + }, + { + "epoch": 0.71, + "grad_norm": 0.17909982800483704, + "learning_rate": 8.716763579922204e-05, + "loss": 0.0504, + "step": 403 + }, + { + "epoch": 0.71, + "grad_norm": 0.17002324759960175, + "learning_rate": 8.710521484862439e-05, + "loss": 0.0856, + "step": 404 + }, + { + "epoch": 0.72, + "grad_norm": 0.2229025810956955, + "learning_rate": 8.704266491434788e-05, + "loss": 0.0591, + "step": 405 + }, + { + "epoch": 0.72, + "grad_norm": 0.1444559544324875, + "learning_rate": 8.697998621382607e-05, + "loss": 0.0297, + "step": 406 + }, + { + "epoch": 0.72, + "grad_norm": 0.2677093744277954, + "learning_rate": 8.69171789649402e-05, + "loss": 0.0543, + "step": 407 + }, + { + "epoch": 0.72, + "grad_norm": 0.5875506401062012, + "learning_rate": 8.685424338601834e-05, + "loss": 0.1199, + "step": 408 + }, + { + "epoch": 0.72, + "grad_norm": 0.34576529264450073, + "learning_rate": 8.679117969583464e-05, + "loss": 0.1003, + "step": 409 + }, + { + "epoch": 0.73, + "grad_norm": 0.2765222489833832, + "learning_rate": 8.672798811360863e-05, + "loss": 0.0358, + "step": 410 + }, + { + "epoch": 0.73, + "grad_norm": 0.17154745757579803, + "learning_rate": 8.666466885900438e-05, + "loss": 0.0736, + "step": 411 + }, + { + "epoch": 0.73, + "grad_norm": 0.1607416719198227, + "learning_rate": 8.660122215212977e-05, + "loss": 0.0678, + "step": 412 + }, + { + "epoch": 0.73, + "grad_norm": 0.1216413602232933, + "learning_rate": 8.653764821353573e-05, + "loss": 0.0341, + "step": 413 + }, + { + "epoch": 0.73, + "grad_norm": 0.3713608980178833, + "learning_rate": 8.647394726421547e-05, + "loss": 0.118, + "step": 414 + }, + { + "epoch": 0.73, + "grad_norm": 0.48576387763023376, + "learning_rate": 8.641011952560371e-05, + "loss": 0.0931, + "step": 415 + }, + { + "epoch": 0.74, + "grad_norm": 0.14704179763793945, + "learning_rate": 8.63461652195759e-05, + "loss": 0.0285, + "step": 416 + }, + { + "epoch": 0.74, + "grad_norm": 0.3306657671928406, + "learning_rate": 8.628208456844747e-05, + "loss": 0.0737, + "step": 417 + }, + { + "epoch": 0.74, + "grad_norm": 0.39767885208129883, + "learning_rate": 8.621787779497305e-05, + "loss": 0.097, + "step": 418 + }, + { + "epoch": 0.74, + "grad_norm": 0.3198534846305847, + "learning_rate": 8.615354512234569e-05, + "loss": 0.0731, + "step": 419 + }, + { + "epoch": 0.74, + "grad_norm": 0.19191338121891022, + "learning_rate": 8.608908677419606e-05, + "loss": 0.0697, + "step": 420 + }, + { + "epoch": 0.74, + "grad_norm": 0.26490989327430725, + "learning_rate": 8.602450297459172e-05, + "loss": 0.1214, + "step": 421 + }, + { + "epoch": 0.75, + "grad_norm": 0.3545917868614197, + "learning_rate": 8.595979394803634e-05, + "loss": 0.0933, + "step": 422 + }, + { + "epoch": 0.75, + "grad_norm": 0.28239014744758606, + "learning_rate": 8.589495991946885e-05, + "loss": 0.0707, + "step": 423 + }, + { + "epoch": 0.75, + "grad_norm": 0.48272502422332764, + "learning_rate": 8.583000111426276e-05, + "loss": 0.0831, + "step": 424 + }, + { + "epoch": 0.75, + "grad_norm": 0.15938633680343628, + "learning_rate": 8.576491775822527e-05, + "loss": 0.0899, + "step": 425 + }, + { + "epoch": 0.75, + "grad_norm": 0.2616162896156311, + "learning_rate": 8.569971007759657e-05, + "loss": 0.118, + "step": 426 + }, + { + "epoch": 0.75, + "eval_loss": 0.0783080980181694, + "eval_runtime": 14.6414, + "eval_samples_per_second": 32.579, + "eval_steps_per_second": 8.196, + "step": 426 + }, + { + "epoch": 0.76, + "grad_norm": 0.19521737098693848, + "learning_rate": 8.563437829904903e-05, + "loss": 0.0814, + "step": 427 + }, + { + "epoch": 0.76, + "grad_norm": 0.194011390209198, + "learning_rate": 8.55689226496864e-05, + "loss": 0.0799, + "step": 428 + }, + { + "epoch": 0.76, + "grad_norm": 0.2743787169456482, + "learning_rate": 8.550334335704298e-05, + "loss": 0.0869, + "step": 429 + }, + { + "epoch": 0.76, + "grad_norm": 0.1369010955095291, + "learning_rate": 8.543764064908295e-05, + "loss": 0.0435, + "step": 430 + }, + { + "epoch": 0.76, + "grad_norm": 0.24237819015979767, + "learning_rate": 8.537181475419944e-05, + "loss": 0.1148, + "step": 431 + }, + { + "epoch": 0.76, + "grad_norm": 0.14511409401893616, + "learning_rate": 8.530586590121383e-05, + "loss": 0.0764, + "step": 432 + }, + { + "epoch": 0.77, + "grad_norm": 0.15356196463108063, + "learning_rate": 8.523979431937492e-05, + "loss": 0.05, + "step": 433 + }, + { + "epoch": 0.77, + "grad_norm": 0.21860916912555695, + "learning_rate": 8.51736002383581e-05, + "loss": 0.0971, + "step": 434 + }, + { + "epoch": 0.77, + "grad_norm": 0.23724305629730225, + "learning_rate": 8.510728388826463e-05, + "loss": 0.1049, + "step": 435 + }, + { + "epoch": 0.77, + "grad_norm": 0.25301918387413025, + "learning_rate": 8.50408454996208e-05, + "loss": 0.0848, + "step": 436 + }, + { + "epoch": 0.77, + "grad_norm": 0.22409550845623016, + "learning_rate": 8.497428530337706e-05, + "loss": 0.101, + "step": 437 + }, + { + "epoch": 0.77, + "grad_norm": 0.1324710100889206, + "learning_rate": 8.490760353090737e-05, + "loss": 0.0723, + "step": 438 + }, + { + "epoch": 0.78, + "grad_norm": 0.1362515389919281, + "learning_rate": 8.484080041400826e-05, + "loss": 0.0709, + "step": 439 + }, + { + "epoch": 0.78, + "grad_norm": 0.16375669836997986, + "learning_rate": 8.477387618489807e-05, + "loss": 0.0405, + "step": 440 + }, + { + "epoch": 0.78, + "grad_norm": 0.21752700209617615, + "learning_rate": 8.470683107621616e-05, + "loss": 0.0455, + "step": 441 + }, + { + "epoch": 0.78, + "grad_norm": 0.1989530771970749, + "learning_rate": 8.463966532102207e-05, + "loss": 0.0704, + "step": 442 + }, + { + "epoch": 0.78, + "grad_norm": 0.192123144865036, + "learning_rate": 8.457237915279476e-05, + "loss": 0.063, + "step": 443 + }, + { + "epoch": 0.79, + "grad_norm": 0.1221012994647026, + "learning_rate": 8.450497280543174e-05, + "loss": 0.0302, + "step": 444 + }, + { + "epoch": 0.79, + "grad_norm": 0.5705539584159851, + "learning_rate": 8.443744651324827e-05, + "loss": 0.1531, + "step": 445 + }, + { + "epoch": 0.79, + "grad_norm": 0.21490426361560822, + "learning_rate": 8.436980051097659e-05, + "loss": 0.0626, + "step": 446 + }, + { + "epoch": 0.79, + "grad_norm": 0.2654309570789337, + "learning_rate": 8.430203503376505e-05, + "loss": 0.0838, + "step": 447 + }, + { + "epoch": 0.79, + "grad_norm": 0.22457195818424225, + "learning_rate": 8.423415031717733e-05, + "loss": 0.0309, + "step": 448 + }, + { + "epoch": 0.79, + "grad_norm": 0.10934180021286011, + "learning_rate": 8.416614659719157e-05, + "loss": 0.0132, + "step": 449 + }, + { + "epoch": 0.8, + "grad_norm": 0.32622861862182617, + "learning_rate": 8.409802411019963e-05, + "loss": 0.107, + "step": 450 + }, + { + "epoch": 0.8, + "grad_norm": 0.24298590421676636, + "learning_rate": 8.40297830930062e-05, + "loss": 0.1268, + "step": 451 + }, + { + "epoch": 0.8, + "grad_norm": 0.29994437098503113, + "learning_rate": 8.396142378282798e-05, + "loss": 0.0747, + "step": 452 + }, + { + "epoch": 0.8, + "grad_norm": 0.16668649017810822, + "learning_rate": 8.389294641729293e-05, + "loss": 0.0479, + "step": 453 + }, + { + "epoch": 0.8, + "grad_norm": 0.26706060767173767, + "learning_rate": 8.382435123443934e-05, + "loss": 0.1116, + "step": 454 + }, + { + "epoch": 0.8, + "grad_norm": 0.1750030219554901, + "learning_rate": 8.375563847271506e-05, + "loss": 0.0597, + "step": 455 + }, + { + "epoch": 0.81, + "grad_norm": 0.2318125069141388, + "learning_rate": 8.36868083709767e-05, + "loss": 0.0909, + "step": 456 + }, + { + "epoch": 0.81, + "grad_norm": 0.1834569126367569, + "learning_rate": 8.361786116848872e-05, + "loss": 0.0813, + "step": 457 + }, + { + "epoch": 0.81, + "grad_norm": 0.27685895562171936, + "learning_rate": 8.354879710492264e-05, + "loss": 0.1301, + "step": 458 + }, + { + "epoch": 0.81, + "grad_norm": 0.16120545566082, + "learning_rate": 8.347961642035624e-05, + "loss": 0.0717, + "step": 459 + }, + { + "epoch": 0.81, + "grad_norm": 0.17625439167022705, + "learning_rate": 8.341031935527267e-05, + "loss": 0.0867, + "step": 460 + }, + { + "epoch": 0.82, + "grad_norm": 0.2321135252714157, + "learning_rate": 8.334090615055966e-05, + "loss": 0.1122, + "step": 461 + }, + { + "epoch": 0.82, + "grad_norm": 0.12720270454883575, + "learning_rate": 8.327137704750862e-05, + "loss": 0.0375, + "step": 462 + }, + { + "epoch": 0.82, + "grad_norm": 0.2046743929386139, + "learning_rate": 8.320173228781389e-05, + "loss": 0.0808, + "step": 463 + }, + { + "epoch": 0.82, + "grad_norm": 0.16513489186763763, + "learning_rate": 8.313197211357181e-05, + "loss": 0.0825, + "step": 464 + }, + { + "epoch": 0.82, + "grad_norm": 0.12492749094963074, + "learning_rate": 8.306209676727994e-05, + "loss": 0.0876, + "step": 465 + }, + { + "epoch": 0.82, + "grad_norm": 0.1343008577823639, + "learning_rate": 8.299210649183619e-05, + "loss": 0.0852, + "step": 466 + }, + { + "epoch": 0.83, + "grad_norm": 0.13951613008975983, + "learning_rate": 8.2922001530538e-05, + "loss": 0.1003, + "step": 467 + }, + { + "epoch": 0.83, + "grad_norm": 0.16553768515586853, + "learning_rate": 8.285178212708143e-05, + "loss": 0.0662, + "step": 468 + }, + { + "epoch": 0.83, + "grad_norm": 0.15311822295188904, + "learning_rate": 8.278144852556042e-05, + "loss": 0.0785, + "step": 469 + }, + { + "epoch": 0.83, + "grad_norm": 0.1948017179965973, + "learning_rate": 8.271100097046584e-05, + "loss": 0.0898, + "step": 470 + }, + { + "epoch": 0.83, + "grad_norm": 0.11078551411628723, + "learning_rate": 8.264043970668469e-05, + "loss": 0.0386, + "step": 471 + }, + { + "epoch": 0.84, + "grad_norm": 0.1605585515499115, + "learning_rate": 8.256976497949924e-05, + "loss": 0.0497, + "step": 472 + }, + { + "epoch": 0.84, + "grad_norm": 0.1617887318134308, + "learning_rate": 8.249897703458619e-05, + "loss": 0.0624, + "step": 473 + }, + { + "epoch": 0.84, + "grad_norm": 0.1274091750383377, + "learning_rate": 8.242807611801578e-05, + "loss": 0.0578, + "step": 474 + }, + { + "epoch": 0.84, + "grad_norm": 0.15953154861927032, + "learning_rate": 8.235706247625098e-05, + "loss": 0.042, + "step": 475 + }, + { + "epoch": 0.84, + "grad_norm": 0.27984094619750977, + "learning_rate": 8.228593635614659e-05, + "loss": 0.1037, + "step": 476 + }, + { + "epoch": 0.84, + "grad_norm": 0.1895013153553009, + "learning_rate": 8.22146980049484e-05, + "loss": 0.0728, + "step": 477 + }, + { + "epoch": 0.85, + "grad_norm": 0.1580246388912201, + "learning_rate": 8.214334767029239e-05, + "loss": 0.0398, + "step": 478 + }, + { + "epoch": 0.85, + "grad_norm": 0.2391231805086136, + "learning_rate": 8.207188560020373e-05, + "loss": 0.0707, + "step": 479 + }, + { + "epoch": 0.85, + "grad_norm": 0.25975751876831055, + "learning_rate": 8.200031204309603e-05, + "loss": 0.1369, + "step": 480 + }, + { + "epoch": 0.85, + "grad_norm": 0.32591861486434937, + "learning_rate": 8.192862724777051e-05, + "loss": 0.0878, + "step": 481 + }, + { + "epoch": 0.85, + "grad_norm": 0.32488566637039185, + "learning_rate": 8.185683146341496e-05, + "loss": 0.0692, + "step": 482 + }, + { + "epoch": 0.85, + "grad_norm": 0.1918002963066101, + "learning_rate": 8.178492493960309e-05, + "loss": 0.0942, + "step": 483 + }, + { + "epoch": 0.86, + "grad_norm": 0.43140751123428345, + "learning_rate": 8.171290792629347e-05, + "loss": 0.0979, + "step": 484 + }, + { + "epoch": 0.86, + "grad_norm": 0.1771157830953598, + "learning_rate": 8.164078067382882e-05, + "loss": 0.0894, + "step": 485 + }, + { + "epoch": 0.86, + "grad_norm": 0.25012728571891785, + "learning_rate": 8.1568543432935e-05, + "loss": 0.0734, + "step": 486 + }, + { + "epoch": 0.86, + "grad_norm": 0.19328337907791138, + "learning_rate": 8.149619645472031e-05, + "loss": 0.0869, + "step": 487 + }, + { + "epoch": 0.86, + "grad_norm": 0.18180640041828156, + "learning_rate": 8.142373999067439e-05, + "loss": 0.0897, + "step": 488 + }, + { + "epoch": 0.87, + "grad_norm": 0.25753355026245117, + "learning_rate": 8.135117429266757e-05, + "loss": 0.0883, + "step": 489 + }, + { + "epoch": 0.87, + "grad_norm": 0.23837833106517792, + "learning_rate": 8.127849961294984e-05, + "loss": 0.0549, + "step": 490 + }, + { + "epoch": 0.87, + "grad_norm": 0.25032365322113037, + "learning_rate": 8.120571620415006e-05, + "loss": 0.0976, + "step": 491 + }, + { + "epoch": 0.87, + "grad_norm": 0.30728307366371155, + "learning_rate": 8.113282431927502e-05, + "loss": 0.0709, + "step": 492 + }, + { + "epoch": 0.87, + "grad_norm": 0.1391928791999817, + "learning_rate": 8.10598242117086e-05, + "loss": 0.0378, + "step": 493 + }, + { + "epoch": 0.87, + "grad_norm": 0.1786775141954422, + "learning_rate": 8.098671613521089e-05, + "loss": 0.0671, + "step": 494 + }, + { + "epoch": 0.88, + "grad_norm": 0.21963584423065186, + "learning_rate": 8.091350034391732e-05, + "loss": 0.0936, + "step": 495 + }, + { + "epoch": 0.88, + "grad_norm": 0.13954299688339233, + "learning_rate": 8.084017709233767e-05, + "loss": 0.052, + "step": 496 + }, + { + "epoch": 0.88, + "grad_norm": 0.19656923413276672, + "learning_rate": 8.076674663535537e-05, + "loss": 0.0584, + "step": 497 + }, + { + "epoch": 0.88, + "grad_norm": 0.14330637454986572, + "learning_rate": 8.069320922822643e-05, + "loss": 0.0786, + "step": 498 + }, + { + "epoch": 0.88, + "grad_norm": 0.19019991159439087, + "learning_rate": 8.061956512657871e-05, + "loss": 0.0837, + "step": 499 + }, + { + "epoch": 0.88, + "grad_norm": 0.2079285979270935, + "learning_rate": 8.05458145864109e-05, + "loss": 0.0459, + "step": 500 + }, + { + "epoch": 0.89, + "grad_norm": 0.3516862392425537, + "learning_rate": 8.047195786409172e-05, + "loss": 0.191, + "step": 501 + }, + { + "epoch": 0.89, + "grad_norm": 0.192392036318779, + "learning_rate": 8.039799521635896e-05, + "loss": 0.1072, + "step": 502 + }, + { + "epoch": 0.89, + "grad_norm": 0.1787678301334381, + "learning_rate": 8.032392690031867e-05, + "loss": 0.0649, + "step": 503 + }, + { + "epoch": 0.89, + "grad_norm": 0.21046535670757294, + "learning_rate": 8.024975317344421e-05, + "loss": 0.1065, + "step": 504 + }, + { + "epoch": 0.89, + "grad_norm": 0.1215684562921524, + "learning_rate": 8.017547429357532e-05, + "loss": 0.0433, + "step": 505 + }, + { + "epoch": 0.9, + "grad_norm": 0.1342051476240158, + "learning_rate": 8.010109051891731e-05, + "loss": 0.0774, + "step": 506 + }, + { + "epoch": 0.9, + "grad_norm": 0.10215850174427032, + "learning_rate": 8.002660210804011e-05, + "loss": 0.0338, + "step": 507 + }, + { + "epoch": 0.9, + "grad_norm": 0.23539598286151886, + "learning_rate": 7.995200931987743e-05, + "loss": 0.0516, + "step": 508 + }, + { + "epoch": 0.9, + "grad_norm": 0.15601155161857605, + "learning_rate": 7.987731241372572e-05, + "loss": 0.0559, + "step": 509 + }, + { + "epoch": 0.9, + "grad_norm": 0.165851429104805, + "learning_rate": 7.98025116492434e-05, + "loss": 0.0372, + "step": 510 + }, + { + "epoch": 0.9, + "grad_norm": 0.21045421063899994, + "learning_rate": 7.972760728644996e-05, + "loss": 0.086, + "step": 511 + }, + { + "epoch": 0.91, + "grad_norm": 0.14814500510692596, + "learning_rate": 7.965259958572496e-05, + "loss": 0.0587, + "step": 512 + }, + { + "epoch": 0.91, + "grad_norm": 0.15543898940086365, + "learning_rate": 7.95774888078072e-05, + "loss": 0.0682, + "step": 513 + }, + { + "epoch": 0.91, + "grad_norm": 0.13527697324752808, + "learning_rate": 7.950227521379382e-05, + "loss": 0.0468, + "step": 514 + }, + { + "epoch": 0.91, + "grad_norm": 0.35773295164108276, + "learning_rate": 7.94269590651393e-05, + "loss": 0.1273, + "step": 515 + }, + { + "epoch": 0.91, + "grad_norm": 0.22433511912822723, + "learning_rate": 7.935154062365467e-05, + "loss": 0.0438, + "step": 516 + }, + { + "epoch": 0.91, + "grad_norm": 0.1453983038663864, + "learning_rate": 7.927602015150655e-05, + "loss": 0.0367, + "step": 517 + }, + { + "epoch": 0.92, + "grad_norm": 0.2343645691871643, + "learning_rate": 7.920039791121617e-05, + "loss": 0.128, + "step": 518 + }, + { + "epoch": 0.92, + "grad_norm": 0.25178173184394836, + "learning_rate": 7.912467416565861e-05, + "loss": 0.1094, + "step": 519 + }, + { + "epoch": 0.92, + "grad_norm": 0.3170076012611389, + "learning_rate": 7.904884917806174e-05, + "loss": 0.1323, + "step": 520 + }, + { + "epoch": 0.92, + "grad_norm": 0.15917453169822693, + "learning_rate": 7.897292321200538e-05, + "loss": 0.036, + "step": 521 + }, + { + "epoch": 0.92, + "grad_norm": 0.31578320264816284, + "learning_rate": 7.889689653142036e-05, + "loss": 0.0909, + "step": 522 + }, + { + "epoch": 0.93, + "grad_norm": 0.16602741181850433, + "learning_rate": 7.882076940058764e-05, + "loss": 0.0371, + "step": 523 + }, + { + "epoch": 0.93, + "grad_norm": 0.3235325813293457, + "learning_rate": 7.874454208413731e-05, + "loss": 0.1561, + "step": 524 + }, + { + "epoch": 0.93, + "grad_norm": 0.1256486028432846, + "learning_rate": 7.866821484704776e-05, + "loss": 0.0364, + "step": 525 + }, + { + "epoch": 0.93, + "grad_norm": 0.2234162986278534, + "learning_rate": 7.859178795464472e-05, + "loss": 0.0883, + "step": 526 + }, + { + "epoch": 0.93, + "grad_norm": 0.1564294993877411, + "learning_rate": 7.851526167260034e-05, + "loss": 0.0679, + "step": 527 + }, + { + "epoch": 0.93, + "grad_norm": 0.16309525072574615, + "learning_rate": 7.84386362669322e-05, + "loss": 0.0912, + "step": 528 + }, + { + "epoch": 0.94, + "grad_norm": 0.21584004163742065, + "learning_rate": 7.836191200400255e-05, + "loss": 0.0695, + "step": 529 + }, + { + "epoch": 0.94, + "grad_norm": 0.15948422253131866, + "learning_rate": 7.828508915051724e-05, + "loss": 0.0459, + "step": 530 + }, + { + "epoch": 0.94, + "grad_norm": 0.24016940593719482, + "learning_rate": 7.82081679735248e-05, + "loss": 0.1127, + "step": 531 + }, + { + "epoch": 0.94, + "grad_norm": 0.2894397974014282, + "learning_rate": 7.813114874041557e-05, + "loss": 0.0584, + "step": 532 + }, + { + "epoch": 0.94, + "grad_norm": 0.20707662403583527, + "learning_rate": 7.805403171892079e-05, + "loss": 0.1045, + "step": 533 + }, + { + "epoch": 0.94, + "grad_norm": 0.23427248001098633, + "learning_rate": 7.797681717711161e-05, + "loss": 0.1345, + "step": 534 + }, + { + "epoch": 0.95, + "grad_norm": 0.13141866028308868, + "learning_rate": 7.789950538339812e-05, + "loss": 0.052, + "step": 535 + }, + { + "epoch": 0.95, + "grad_norm": 0.21118536591529846, + "learning_rate": 7.782209660652855e-05, + "loss": 0.1272, + "step": 536 + }, + { + "epoch": 0.95, + "grad_norm": 0.15485352277755737, + "learning_rate": 7.77445911155882e-05, + "loss": 0.0686, + "step": 537 + }, + { + "epoch": 0.95, + "grad_norm": 0.11380946636199951, + "learning_rate": 7.766698917999861e-05, + "loss": 0.0735, + "step": 538 + }, + { + "epoch": 0.95, + "grad_norm": 0.26798170804977417, + "learning_rate": 7.758929106951656e-05, + "loss": 0.0934, + "step": 539 + }, + { + "epoch": 0.96, + "grad_norm": 0.23003587126731873, + "learning_rate": 7.751149705423312e-05, + "loss": 0.0816, + "step": 540 + }, + { + "epoch": 0.96, + "grad_norm": 0.2122953236103058, + "learning_rate": 7.743360740457278e-05, + "loss": 0.0827, + "step": 541 + }, + { + "epoch": 0.96, + "grad_norm": 0.22673499584197998, + "learning_rate": 7.735562239129247e-05, + "loss": 0.1232, + "step": 542 + }, + { + "epoch": 0.96, + "grad_norm": 0.24960415065288544, + "learning_rate": 7.727754228548058e-05, + "loss": 0.1124, + "step": 543 + }, + { + "epoch": 0.96, + "grad_norm": 0.10405872017145157, + "learning_rate": 7.719936735855611e-05, + "loss": 0.0687, + "step": 544 + }, + { + "epoch": 0.96, + "grad_norm": 0.16980154812335968, + "learning_rate": 7.712109788226762e-05, + "loss": 0.0874, + "step": 545 + }, + { + "epoch": 0.97, + "grad_norm": 0.1486412137746811, + "learning_rate": 7.704273412869238e-05, + "loss": 0.0815, + "step": 546 + }, + { + "epoch": 0.97, + "grad_norm": 0.14432762563228607, + "learning_rate": 7.696427637023538e-05, + "loss": 0.0752, + "step": 547 + }, + { + "epoch": 0.97, + "grad_norm": 0.2627028822898865, + "learning_rate": 7.688572487962835e-05, + "loss": 0.0982, + "step": 548 + }, + { + "epoch": 0.97, + "grad_norm": 0.16832011938095093, + "learning_rate": 7.680707992992888e-05, + "loss": 0.0895, + "step": 549 + }, + { + "epoch": 0.97, + "grad_norm": 0.14999301731586456, + "learning_rate": 7.672834179451942e-05, + "loss": 0.0544, + "step": 550 + }, + { + "epoch": 0.97, + "grad_norm": 0.14237482845783234, + "learning_rate": 7.664951074710638e-05, + "loss": 0.0623, + "step": 551 + }, + { + "epoch": 0.98, + "grad_norm": 0.1694159060716629, + "learning_rate": 7.657058706171911e-05, + "loss": 0.0784, + "step": 552 + }, + { + "epoch": 0.98, + "grad_norm": 0.1470886617898941, + "learning_rate": 7.649157101270902e-05, + "loss": 0.0635, + "step": 553 + }, + { + "epoch": 0.98, + "grad_norm": 0.16492018103599548, + "learning_rate": 7.641246287474855e-05, + "loss": 0.0669, + "step": 554 + }, + { + "epoch": 0.98, + "grad_norm": 0.195392444729805, + "learning_rate": 7.633326292283028e-05, + "loss": 0.0387, + "step": 555 + }, + { + "epoch": 0.98, + "grad_norm": 0.17653177678585052, + "learning_rate": 7.625397143226596e-05, + "loss": 0.0592, + "step": 556 + }, + { + "epoch": 0.99, + "grad_norm": 0.23455718159675598, + "learning_rate": 7.617458867868553e-05, + "loss": 0.0882, + "step": 557 + }, + { + "epoch": 0.99, + "grad_norm": 0.3588998317718506, + "learning_rate": 7.609511493803616e-05, + "loss": 0.107, + "step": 558 + }, + { + "epoch": 0.99, + "grad_norm": 0.2767946720123291, + "learning_rate": 7.601555048658134e-05, + "loss": 0.1609, + "step": 559 + }, + { + "epoch": 0.99, + "grad_norm": 0.22181196510791779, + "learning_rate": 7.593589560089985e-05, + "loss": 0.0598, + "step": 560 + }, + { + "epoch": 0.99, + "grad_norm": 0.30335313081741333, + "learning_rate": 7.585615055788484e-05, + "loss": 0.0825, + "step": 561 + }, + { + "epoch": 0.99, + "grad_norm": 0.19477833807468414, + "learning_rate": 7.577631563474291e-05, + "loss": 0.0446, + "step": 562 + }, + { + "epoch": 1.0, + "grad_norm": 0.11036123335361481, + "learning_rate": 7.569639110899303e-05, + "loss": 0.025, + "step": 563 + }, + { + "epoch": 1.0, + "grad_norm": 0.19955220818519592, + "learning_rate": 7.561637725846568e-05, + "loss": 0.0484, + "step": 564 + }, + { + "epoch": 1.0, + "grad_norm": 0.20293684303760529, + "learning_rate": 7.553627436130183e-05, + "loss": 0.0689, + "step": 565 + }, + { + "epoch": 1.0, + "grad_norm": 0.18100765347480774, + "learning_rate": 7.545608269595202e-05, + "loss": 0.0371, + "step": 566 + }, + { + "epoch": 1.0, + "grad_norm": 0.43053922057151794, + "learning_rate": 7.537580254117531e-05, + "loss": 0.0901, + "step": 567 + }, + { + "epoch": 1.0, + "grad_norm": 0.3413926959037781, + "learning_rate": 7.529543417603844e-05, + "loss": 0.1088, + "step": 568 + }, + { + "epoch": 1.0, + "eval_loss": 0.09266742318868637, + "eval_runtime": 14.642, + "eval_samples_per_second": 32.578, + "eval_steps_per_second": 8.196, + "step": 568 + }, + { + "epoch": 1.01, + "grad_norm": 0.1796027272939682, + "learning_rate": 7.521497787991471e-05, + "loss": 0.0244, + "step": 569 + }, + { + "epoch": 1.01, + "grad_norm": 0.30515041947364807, + "learning_rate": 7.513443393248312e-05, + "loss": 0.0682, + "step": 570 + }, + { + "epoch": 1.01, + "grad_norm": 0.3712550103664398, + "learning_rate": 7.505380261372734e-05, + "loss": 0.0921, + "step": 571 + }, + { + "epoch": 1.01, + "grad_norm": 0.4219339191913605, + "learning_rate": 7.497308420393477e-05, + "loss": 0.0785, + "step": 572 + }, + { + "epoch": 1.01, + "grad_norm": 0.24129725992679596, + "learning_rate": 7.489227898369559e-05, + "loss": 0.0851, + "step": 573 + }, + { + "epoch": 1.02, + "grad_norm": 0.24595998227596283, + "learning_rate": 7.481138723390164e-05, + "loss": 0.1143, + "step": 574 + }, + { + "epoch": 1.02, + "grad_norm": 0.13906948268413544, + "learning_rate": 7.473040923574567e-05, + "loss": 0.0402, + "step": 575 + }, + { + "epoch": 1.02, + "grad_norm": 0.1885530650615692, + "learning_rate": 7.464934527072016e-05, + "loss": 0.0384, + "step": 576 + }, + { + "epoch": 1.02, + "grad_norm": 0.13116823136806488, + "learning_rate": 7.456819562061649e-05, + "loss": 0.0447, + "step": 577 + }, + { + "epoch": 1.02, + "grad_norm": 0.23953841626644135, + "learning_rate": 7.448696056752383e-05, + "loss": 0.0602, + "step": 578 + }, + { + "epoch": 1.02, + "grad_norm": 0.17374739050865173, + "learning_rate": 7.440564039382827e-05, + "loss": 0.0657, + "step": 579 + }, + { + "epoch": 1.03, + "grad_norm": 0.20921552181243896, + "learning_rate": 7.432423538221178e-05, + "loss": 0.0757, + "step": 580 + }, + { + "epoch": 1.03, + "grad_norm": 0.10258325189352036, + "learning_rate": 7.424274581565123e-05, + "loss": 0.0237, + "step": 581 + }, + { + "epoch": 1.03, + "grad_norm": 0.31752172112464905, + "learning_rate": 7.416117197741742e-05, + "loss": 0.0625, + "step": 582 + }, + { + "epoch": 1.03, + "grad_norm": 0.229179248213768, + "learning_rate": 7.407951415107413e-05, + "loss": 0.0792, + "step": 583 + }, + { + "epoch": 1.03, + "grad_norm": 0.16059361398220062, + "learning_rate": 7.3997772620477e-05, + "loss": 0.0718, + "step": 584 + }, + { + "epoch": 1.03, + "grad_norm": 0.1626499593257904, + "learning_rate": 7.391594766977277e-05, + "loss": 0.0457, + "step": 585 + }, + { + "epoch": 1.04, + "grad_norm": 0.1549261212348938, + "learning_rate": 7.383403958339807e-05, + "loss": 0.0544, + "step": 586 + }, + { + "epoch": 1.04, + "grad_norm": 0.1588374525308609, + "learning_rate": 7.375204864607852e-05, + "loss": 0.0342, + "step": 587 + }, + { + "epoch": 1.04, + "grad_norm": 0.09883646667003632, + "learning_rate": 7.366997514282782e-05, + "loss": 0.0292, + "step": 588 + }, + { + "epoch": 1.04, + "grad_norm": 0.3421178460121155, + "learning_rate": 7.358781935894659e-05, + "loss": 0.0999, + "step": 589 + }, + { + "epoch": 1.04, + "grad_norm": 0.12105683982372284, + "learning_rate": 7.350558158002154e-05, + "loss": 0.023, + "step": 590 + }, + { + "epoch": 1.05, + "grad_norm": 0.15255074203014374, + "learning_rate": 7.342326209192435e-05, + "loss": 0.0423, + "step": 591 + }, + { + "epoch": 1.05, + "grad_norm": 0.18337713181972504, + "learning_rate": 7.33408611808108e-05, + "loss": 0.0305, + "step": 592 + }, + { + "epoch": 1.05, + "grad_norm": 0.12969495356082916, + "learning_rate": 7.325837913311966e-05, + "loss": 0.0175, + "step": 593 + }, + { + "epoch": 1.05, + "grad_norm": 0.2849477231502533, + "learning_rate": 7.317581623557177e-05, + "loss": 0.0878, + "step": 594 + }, + { + "epoch": 1.05, + "grad_norm": 0.24307942390441895, + "learning_rate": 7.3093172775169e-05, + "loss": 0.037, + "step": 595 + }, + { + "epoch": 1.05, + "grad_norm": 0.2612784206867218, + "learning_rate": 7.301044903919325e-05, + "loss": 0.097, + "step": 596 + }, + { + "epoch": 1.06, + "grad_norm": 0.2548207640647888, + "learning_rate": 7.292764531520553e-05, + "loss": 0.0928, + "step": 597 + }, + { + "epoch": 1.06, + "grad_norm": 0.3221377432346344, + "learning_rate": 7.284476189104485e-05, + "loss": 0.0874, + "step": 598 + }, + { + "epoch": 1.06, + "grad_norm": 0.20441681146621704, + "learning_rate": 7.27617990548273e-05, + "loss": 0.035, + "step": 599 + }, + { + "epoch": 1.06, + "grad_norm": 0.26378926634788513, + "learning_rate": 7.267875709494499e-05, + "loss": 0.0494, + "step": 600 + }, + { + "epoch": 1.06, + "grad_norm": 0.5505862832069397, + "learning_rate": 7.259563630006512e-05, + "loss": 0.1241, + "step": 601 + }, + { + "epoch": 1.07, + "grad_norm": 0.13375498354434967, + "learning_rate": 7.251243695912886e-05, + "loss": 0.0241, + "step": 602 + }, + { + "epoch": 1.07, + "grad_norm": 0.19569019973278046, + "learning_rate": 7.242915936135051e-05, + "loss": 0.0698, + "step": 603 + }, + { + "epoch": 1.07, + "grad_norm": 0.28432735800743103, + "learning_rate": 7.234580379621637e-05, + "loss": 0.0641, + "step": 604 + }, + { + "epoch": 1.07, + "grad_norm": 0.13998962938785553, + "learning_rate": 7.22623705534837e-05, + "loss": 0.0365, + "step": 605 + }, + { + "epoch": 1.07, + "grad_norm": 0.1772097498178482, + "learning_rate": 7.217885992317985e-05, + "loss": 0.081, + "step": 606 + }, + { + "epoch": 1.07, + "grad_norm": 0.4318295121192932, + "learning_rate": 7.209527219560119e-05, + "loss": 0.0532, + "step": 607 + }, + { + "epoch": 1.08, + "grad_norm": 0.2505156695842743, + "learning_rate": 7.201160766131207e-05, + "loss": 0.0667, + "step": 608 + }, + { + "epoch": 1.08, + "grad_norm": 0.13024090230464935, + "learning_rate": 7.192786661114384e-05, + "loss": 0.0234, + "step": 609 + }, + { + "epoch": 1.08, + "grad_norm": 0.2824789583683014, + "learning_rate": 7.184404933619377e-05, + "loss": 0.095, + "step": 610 + }, + { + "epoch": 1.08, + "grad_norm": 0.18059489130973816, + "learning_rate": 7.17601561278242e-05, + "loss": 0.0471, + "step": 611 + }, + { + "epoch": 1.08, + "grad_norm": 0.2839769124984741, + "learning_rate": 7.167618727766138e-05, + "loss": 0.0783, + "step": 612 + }, + { + "epoch": 1.08, + "grad_norm": 0.1342955082654953, + "learning_rate": 7.159214307759448e-05, + "loss": 0.0453, + "step": 613 + }, + { + "epoch": 1.09, + "grad_norm": 0.1336507499217987, + "learning_rate": 7.150802381977464e-05, + "loss": 0.0431, + "step": 614 + }, + { + "epoch": 1.09, + "grad_norm": 0.2953212559223175, + "learning_rate": 7.142382979661386e-05, + "loss": 0.0705, + "step": 615 + }, + { + "epoch": 1.09, + "grad_norm": 0.17532870173454285, + "learning_rate": 7.133956130078412e-05, + "loss": 0.0666, + "step": 616 + }, + { + "epoch": 1.09, + "grad_norm": 0.17404836416244507, + "learning_rate": 7.12552186252162e-05, + "loss": 0.0522, + "step": 617 + }, + { + "epoch": 1.09, + "grad_norm": 0.25231000781059265, + "learning_rate": 7.117080206309878e-05, + "loss": 0.0854, + "step": 618 + }, + { + "epoch": 1.1, + "grad_norm": 0.2264215499162674, + "learning_rate": 7.108631190787735e-05, + "loss": 0.0692, + "step": 619 + }, + { + "epoch": 1.1, + "grad_norm": 0.3555202782154083, + "learning_rate": 7.100174845325327e-05, + "loss": 0.074, + "step": 620 + }, + { + "epoch": 1.1, + "grad_norm": 0.34550729393959045, + "learning_rate": 7.091711199318264e-05, + "loss": 0.0831, + "step": 621 + }, + { + "epoch": 1.1, + "grad_norm": 0.14560338854789734, + "learning_rate": 7.083240282187543e-05, + "loss": 0.0404, + "step": 622 + }, + { + "epoch": 1.1, + "grad_norm": 0.23464788496494293, + "learning_rate": 7.074762123379423e-05, + "loss": 0.0699, + "step": 623 + }, + { + "epoch": 1.1, + "grad_norm": 0.22587832808494568, + "learning_rate": 7.066276752365352e-05, + "loss": 0.0887, + "step": 624 + }, + { + "epoch": 1.11, + "grad_norm": 0.17183855175971985, + "learning_rate": 7.057784198641834e-05, + "loss": 0.0373, + "step": 625 + }, + { + "epoch": 1.11, + "grad_norm": 0.19148162007331848, + "learning_rate": 7.049284491730354e-05, + "loss": 0.0289, + "step": 626 + }, + { + "epoch": 1.11, + "grad_norm": 0.26134082674980164, + "learning_rate": 7.040777661177251e-05, + "loss": 0.0367, + "step": 627 + }, + { + "epoch": 1.11, + "grad_norm": 0.5379131436347961, + "learning_rate": 7.032263736553635e-05, + "loss": 0.1049, + "step": 628 + }, + { + "epoch": 1.11, + "grad_norm": 0.13634662330150604, + "learning_rate": 7.023742747455276e-05, + "loss": 0.018, + "step": 629 + }, + { + "epoch": 1.11, + "grad_norm": 0.28767991065979004, + "learning_rate": 7.015214723502496e-05, + "loss": 0.06, + "step": 630 + }, + { + "epoch": 1.12, + "grad_norm": 0.2551933825016022, + "learning_rate": 7.006679694340073e-05, + "loss": 0.0407, + "step": 631 + }, + { + "epoch": 1.12, + "grad_norm": 0.41325151920318604, + "learning_rate": 6.998137689637142e-05, + "loss": 0.046, + "step": 632 + }, + { + "epoch": 1.12, + "grad_norm": 0.25655174255371094, + "learning_rate": 6.989588739087078e-05, + "loss": 0.0398, + "step": 633 + }, + { + "epoch": 1.12, + "grad_norm": 0.3917771279811859, + "learning_rate": 6.981032872407405e-05, + "loss": 0.1072, + "step": 634 + }, + { + "epoch": 1.12, + "grad_norm": 0.20738206803798676, + "learning_rate": 6.972470119339691e-05, + "loss": 0.0457, + "step": 635 + }, + { + "epoch": 1.13, + "grad_norm": 0.1865154653787613, + "learning_rate": 6.963900509649434e-05, + "loss": 0.0258, + "step": 636 + }, + { + "epoch": 1.13, + "grad_norm": 0.282071590423584, + "learning_rate": 6.955324073125979e-05, + "loss": 0.07, + "step": 637 + }, + { + "epoch": 1.13, + "grad_norm": 0.27442115545272827, + "learning_rate": 6.946740839582388e-05, + "loss": 0.0875, + "step": 638 + }, + { + "epoch": 1.13, + "grad_norm": 0.2635151445865631, + "learning_rate": 6.938150838855359e-05, + "loss": 0.0332, + "step": 639 + }, + { + "epoch": 1.13, + "grad_norm": 0.16783182322978973, + "learning_rate": 6.929554100805118e-05, + "loss": 0.0405, + "step": 640 + }, + { + "epoch": 1.13, + "grad_norm": 0.3328685760498047, + "learning_rate": 6.920950655315297e-05, + "loss": 0.1076, + "step": 641 + }, + { + "epoch": 1.14, + "grad_norm": 0.20146729052066803, + "learning_rate": 6.91234053229286e-05, + "loss": 0.0481, + "step": 642 + }, + { + "epoch": 1.14, + "grad_norm": 0.21599121391773224, + "learning_rate": 6.903723761667973e-05, + "loss": 0.0502, + "step": 643 + }, + { + "epoch": 1.14, + "grad_norm": 0.16269706189632416, + "learning_rate": 6.895100373393913e-05, + "loss": 0.0652, + "step": 644 + }, + { + "epoch": 1.14, + "grad_norm": 0.3716180622577667, + "learning_rate": 6.886470397446958e-05, + "loss": 0.0914, + "step": 645 + }, + { + "epoch": 1.14, + "grad_norm": 0.18003414571285248, + "learning_rate": 6.877833863826295e-05, + "loss": 0.0484, + "step": 646 + }, + { + "epoch": 1.14, + "grad_norm": 0.2745915949344635, + "learning_rate": 6.869190802553894e-05, + "loss": 0.1057, + "step": 647 + }, + { + "epoch": 1.15, + "grad_norm": 0.2507147490978241, + "learning_rate": 6.860541243674426e-05, + "loss": 0.0587, + "step": 648 + }, + { + "epoch": 1.15, + "grad_norm": 0.19874247908592224, + "learning_rate": 6.851885217255145e-05, + "loss": 0.0452, + "step": 649 + }, + { + "epoch": 1.15, + "grad_norm": 0.21256215870380402, + "learning_rate": 6.843222753385786e-05, + "loss": 0.0434, + "step": 650 + }, + { + "epoch": 1.15, + "grad_norm": 0.13569054007530212, + "learning_rate": 6.834553882178463e-05, + "loss": 0.0275, + "step": 651 + }, + { + "epoch": 1.15, + "grad_norm": 0.26869267225265503, + "learning_rate": 6.825878633767563e-05, + "loss": 0.1006, + "step": 652 + }, + { + "epoch": 1.16, + "grad_norm": 0.21900776028633118, + "learning_rate": 6.817197038309644e-05, + "loss": 0.0564, + "step": 653 + }, + { + "epoch": 1.16, + "grad_norm": 0.13306765258312225, + "learning_rate": 6.80850912598332e-05, + "loss": 0.0304, + "step": 654 + }, + { + "epoch": 1.16, + "grad_norm": 0.09863998740911484, + "learning_rate": 6.79981492698917e-05, + "loss": 0.0178, + "step": 655 + }, + { + "epoch": 1.16, + "grad_norm": 0.2170545756816864, + "learning_rate": 6.791114471549627e-05, + "loss": 0.0754, + "step": 656 + }, + { + "epoch": 1.16, + "grad_norm": 0.26661446690559387, + "learning_rate": 6.782407789908863e-05, + "loss": 0.1083, + "step": 657 + }, + { + "epoch": 1.16, + "grad_norm": 0.2747049629688263, + "learning_rate": 6.773694912332707e-05, + "loss": 0.0758, + "step": 658 + }, + { + "epoch": 1.17, + "grad_norm": 0.252560019493103, + "learning_rate": 6.764975869108514e-05, + "loss": 0.0681, + "step": 659 + }, + { + "epoch": 1.17, + "grad_norm": 0.2867240905761719, + "learning_rate": 6.756250690545079e-05, + "loss": 0.095, + "step": 660 + }, + { + "epoch": 1.17, + "grad_norm": 0.09760677814483643, + "learning_rate": 6.747519406972524e-05, + "loss": 0.0123, + "step": 661 + }, + { + "epoch": 1.17, + "grad_norm": 0.17589041590690613, + "learning_rate": 6.738782048742187e-05, + "loss": 0.0437, + "step": 662 + }, + { + "epoch": 1.17, + "grad_norm": 0.29266613721847534, + "learning_rate": 6.730038646226532e-05, + "loss": 0.0706, + "step": 663 + }, + { + "epoch": 1.17, + "grad_norm": 0.1055804044008255, + "learning_rate": 6.721289229819024e-05, + "loss": 0.0343, + "step": 664 + }, + { + "epoch": 1.18, + "grad_norm": 0.2909635305404663, + "learning_rate": 6.712533829934042e-05, + "loss": 0.0817, + "step": 665 + }, + { + "epoch": 1.18, + "grad_norm": 0.2604895532131195, + "learning_rate": 6.703772477006757e-05, + "loss": 0.0452, + "step": 666 + }, + { + "epoch": 1.18, + "grad_norm": 0.10520771890878677, + "learning_rate": 6.695005201493038e-05, + "loss": 0.0215, + "step": 667 + }, + { + "epoch": 1.18, + "grad_norm": 0.10080817341804504, + "learning_rate": 6.686232033869344e-05, + "loss": 0.0188, + "step": 668 + }, + { + "epoch": 1.18, + "grad_norm": 0.3340647220611572, + "learning_rate": 6.677453004632608e-05, + "loss": 0.0612, + "step": 669 + }, + { + "epoch": 1.19, + "grad_norm": 0.29719796776771545, + "learning_rate": 6.668668144300149e-05, + "loss": 0.1014, + "step": 670 + }, + { + "epoch": 1.19, + "grad_norm": 0.2131602168083191, + "learning_rate": 6.659877483409545e-05, + "loss": 0.0621, + "step": 671 + }, + { + "epoch": 1.19, + "grad_norm": 0.1867963820695877, + "learning_rate": 6.65108105251855e-05, + "loss": 0.0312, + "step": 672 + }, + { + "epoch": 1.19, + "grad_norm": 0.4250008463859558, + "learning_rate": 6.642278882204963e-05, + "loss": 0.0684, + "step": 673 + }, + { + "epoch": 1.19, + "grad_norm": 0.20828047394752502, + "learning_rate": 6.633471003066543e-05, + "loss": 0.0421, + "step": 674 + }, + { + "epoch": 1.19, + "grad_norm": 0.23356445133686066, + "learning_rate": 6.62465744572089e-05, + "loss": 0.0277, + "step": 675 + }, + { + "epoch": 1.2, + "grad_norm": 0.42427390813827515, + "learning_rate": 6.615838240805344e-05, + "loss": 0.0745, + "step": 676 + }, + { + "epoch": 1.2, + "grad_norm": 0.23298533260822296, + "learning_rate": 6.607013418976874e-05, + "loss": 0.047, + "step": 677 + }, + { + "epoch": 1.2, + "grad_norm": 0.5681192278862, + "learning_rate": 6.598183010911978e-05, + "loss": 0.1032, + "step": 678 + }, + { + "epoch": 1.2, + "grad_norm": 0.15370431542396545, + "learning_rate": 6.589347047306571e-05, + "loss": 0.0224, + "step": 679 + }, + { + "epoch": 1.2, + "grad_norm": 0.2974132001399994, + "learning_rate": 6.580505558875877e-05, + "loss": 0.0908, + "step": 680 + }, + { + "epoch": 1.2, + "grad_norm": 0.12158460170030594, + "learning_rate": 6.571658576354333e-05, + "loss": 0.0212, + "step": 681 + }, + { + "epoch": 1.21, + "grad_norm": 0.32594335079193115, + "learning_rate": 6.562806130495467e-05, + "loss": 0.1016, + "step": 682 + }, + { + "epoch": 1.21, + "grad_norm": 0.3316996097564697, + "learning_rate": 6.5539482520718e-05, + "loss": 0.0639, + "step": 683 + }, + { + "epoch": 1.21, + "grad_norm": 0.21660655736923218, + "learning_rate": 6.545084971874738e-05, + "loss": 0.043, + "step": 684 + }, + { + "epoch": 1.21, + "grad_norm": 0.350033164024353, + "learning_rate": 6.536216320714466e-05, + "loss": 0.0752, + "step": 685 + }, + { + "epoch": 1.21, + "grad_norm": 0.30745336413383484, + "learning_rate": 6.527342329419837e-05, + "loss": 0.0927, + "step": 686 + }, + { + "epoch": 1.22, + "grad_norm": 0.24984771013259888, + "learning_rate": 6.51846302883827e-05, + "loss": 0.0685, + "step": 687 + }, + { + "epoch": 1.22, + "grad_norm": 0.07773179560899734, + "learning_rate": 6.509578449835636e-05, + "loss": 0.0152, + "step": 688 + }, + { + "epoch": 1.22, + "grad_norm": 0.1620987057685852, + "learning_rate": 6.500688623296159e-05, + "loss": 0.0514, + "step": 689 + }, + { + "epoch": 1.22, + "grad_norm": 0.1917831003665924, + "learning_rate": 6.491793580122301e-05, + "loss": 0.066, + "step": 690 + }, + { + "epoch": 1.22, + "grad_norm": 0.21920029819011688, + "learning_rate": 6.482893351234658e-05, + "loss": 0.0547, + "step": 691 + }, + { + "epoch": 1.22, + "grad_norm": 0.29076483845710754, + "learning_rate": 6.473987967571856e-05, + "loss": 0.079, + "step": 692 + }, + { + "epoch": 1.23, + "grad_norm": 0.30292215943336487, + "learning_rate": 6.46507746009043e-05, + "loss": 0.0957, + "step": 693 + }, + { + "epoch": 1.23, + "grad_norm": 0.14139439165592194, + "learning_rate": 6.456161859764744e-05, + "loss": 0.0346, + "step": 694 + }, + { + "epoch": 1.23, + "grad_norm": 0.22850438952445984, + "learning_rate": 6.447241197586847e-05, + "loss": 0.0744, + "step": 695 + }, + { + "epoch": 1.23, + "grad_norm": 0.48915836215019226, + "learning_rate": 6.438315504566397e-05, + "loss": 0.0953, + "step": 696 + }, + { + "epoch": 1.23, + "grad_norm": 0.17644958198070526, + "learning_rate": 6.429384811730528e-05, + "loss": 0.046, + "step": 697 + }, + { + "epoch": 1.23, + "grad_norm": 0.2039819210767746, + "learning_rate": 6.420449150123767e-05, + "loss": 0.1052, + "step": 698 + }, + { + "epoch": 1.24, + "grad_norm": 0.17715586721897125, + "learning_rate": 6.411508550807906e-05, + "loss": 0.0447, + "step": 699 + }, + { + "epoch": 1.24, + "grad_norm": 0.16100600361824036, + "learning_rate": 6.4025630448619e-05, + "loss": 0.0344, + "step": 700 + }, + { + "epoch": 1.24, + "grad_norm": 0.22480256855487823, + "learning_rate": 6.393612663381763e-05, + "loss": 0.0495, + "step": 701 + }, + { + "epoch": 1.24, + "grad_norm": 0.12992677092552185, + "learning_rate": 6.384657437480458e-05, + "loss": 0.0409, + "step": 702 + }, + { + "epoch": 1.24, + "grad_norm": 0.1325366348028183, + "learning_rate": 6.375697398287787e-05, + "loss": 0.0257, + "step": 703 + }, + { + "epoch": 1.25, + "grad_norm": 0.16241514682769775, + "learning_rate": 6.366732576950284e-05, + "loss": 0.0427, + "step": 704 + }, + { + "epoch": 1.25, + "grad_norm": 0.21476183831691742, + "learning_rate": 6.357763004631104e-05, + "loss": 0.0451, + "step": 705 + }, + { + "epoch": 1.25, + "grad_norm": 0.32039332389831543, + "learning_rate": 6.34878871250992e-05, + "loss": 0.0545, + "step": 706 + }, + { + "epoch": 1.25, + "grad_norm": 0.3203076124191284, + "learning_rate": 6.33980973178281e-05, + "loss": 0.0917, + "step": 707 + }, + { + "epoch": 1.25, + "grad_norm": 0.25006967782974243, + "learning_rate": 6.330826093662156e-05, + "loss": 0.1028, + "step": 708 + }, + { + "epoch": 1.25, + "grad_norm": 0.52630215883255, + "learning_rate": 6.32183782937652e-05, + "loss": 0.0889, + "step": 709 + }, + { + "epoch": 1.26, + "grad_norm": 0.33741331100463867, + "learning_rate": 6.31284497017055e-05, + "loss": 0.0725, + "step": 710 + }, + { + "epoch": 1.26, + "eval_loss": 0.07285241782665253, + "eval_runtime": 14.6756, + "eval_samples_per_second": 32.503, + "eval_steps_per_second": 8.177, + "step": 710 + }, + { + "epoch": 1.26, + "grad_norm": 0.40746867656707764, + "learning_rate": 6.303847547304873e-05, + "loss": 0.0945, + "step": 711 + }, + { + "epoch": 1.26, + "grad_norm": 0.22757941484451294, + "learning_rate": 6.294845592055967e-05, + "loss": 0.0532, + "step": 712 + }, + { + "epoch": 1.26, + "grad_norm": 0.19006334245204926, + "learning_rate": 6.285839135716079e-05, + "loss": 0.0484, + "step": 713 + }, + { + "epoch": 1.26, + "grad_norm": 0.48126357793807983, + "learning_rate": 6.27682820959309e-05, + "loss": 0.0967, + "step": 714 + }, + { + "epoch": 1.26, + "grad_norm": 0.23766569793224335, + "learning_rate": 6.26781284501043e-05, + "loss": 0.106, + "step": 715 + }, + { + "epoch": 1.27, + "grad_norm": 0.16818860173225403, + "learning_rate": 6.258793073306949e-05, + "loss": 0.0494, + "step": 716 + }, + { + "epoch": 1.27, + "grad_norm": 0.28579115867614746, + "learning_rate": 6.249768925836822e-05, + "loss": 0.0937, + "step": 717 + }, + { + "epoch": 1.27, + "grad_norm": 0.16623319685459137, + "learning_rate": 6.240740433969432e-05, + "loss": 0.0301, + "step": 718 + }, + { + "epoch": 1.27, + "grad_norm": 0.1560198813676834, + "learning_rate": 6.231707629089262e-05, + "loss": 0.0384, + "step": 719 + }, + { + "epoch": 1.27, + "grad_norm": 0.2002251148223877, + "learning_rate": 6.2226705425958e-05, + "loss": 0.0545, + "step": 720 + }, + { + "epoch": 1.28, + "grad_norm": 0.16213096678256989, + "learning_rate": 6.2136292059034e-05, + "loss": 0.0433, + "step": 721 + }, + { + "epoch": 1.28, + "grad_norm": 0.27064821124076843, + "learning_rate": 6.204583650441201e-05, + "loss": 0.0796, + "step": 722 + }, + { + "epoch": 1.28, + "grad_norm": 0.11131159216165543, + "learning_rate": 6.195533907653004e-05, + "loss": 0.0229, + "step": 723 + }, + { + "epoch": 1.28, + "grad_norm": 0.22354401648044586, + "learning_rate": 6.18648000899717e-05, + "loss": 0.0475, + "step": 724 + }, + { + "epoch": 1.28, + "grad_norm": 0.19944117963314056, + "learning_rate": 6.177421985946499e-05, + "loss": 0.0413, + "step": 725 + }, + { + "epoch": 1.28, + "grad_norm": 0.32458746433258057, + "learning_rate": 6.168359869988134e-05, + "loss": 0.1205, + "step": 726 + }, + { + "epoch": 1.29, + "grad_norm": 0.19088833034038544, + "learning_rate": 6.159293692623443e-05, + "loss": 0.0626, + "step": 727 + }, + { + "epoch": 1.29, + "grad_norm": 0.2114744633436203, + "learning_rate": 6.150223485367914e-05, + "loss": 0.048, + "step": 728 + }, + { + "epoch": 1.29, + "grad_norm": 0.11308068782091141, + "learning_rate": 6.141149279751043e-05, + "loss": 0.0286, + "step": 729 + }, + { + "epoch": 1.29, + "grad_norm": 0.22453975677490234, + "learning_rate": 6.13207110731622e-05, + "loss": 0.0279, + "step": 730 + }, + { + "epoch": 1.29, + "grad_norm": 0.274513840675354, + "learning_rate": 6.122988999620634e-05, + "loss": 0.0553, + "step": 731 + }, + { + "epoch": 1.3, + "grad_norm": 0.2700372636318207, + "learning_rate": 6.113902988235145e-05, + "loss": 0.0973, + "step": 732 + }, + { + "epoch": 1.3, + "grad_norm": 0.3287579119205475, + "learning_rate": 6.104813104744188e-05, + "loss": 0.0853, + "step": 733 + }, + { + "epoch": 1.3, + "grad_norm": 0.37582048773765564, + "learning_rate": 6.095719380745654e-05, + "loss": 0.088, + "step": 734 + }, + { + "epoch": 1.3, + "grad_norm": 0.2009502649307251, + "learning_rate": 6.086621847850788e-05, + "loss": 0.0525, + "step": 735 + }, + { + "epoch": 1.3, + "grad_norm": 0.1417909413576126, + "learning_rate": 6.077520537684072e-05, + "loss": 0.0311, + "step": 736 + }, + { + "epoch": 1.3, + "grad_norm": 0.18328174948692322, + "learning_rate": 6.068415481883122e-05, + "loss": 0.0379, + "step": 737 + }, + { + "epoch": 1.31, + "grad_norm": 0.1513252556324005, + "learning_rate": 6.059306712098571e-05, + "loss": 0.0319, + "step": 738 + }, + { + "epoch": 1.31, + "grad_norm": 0.2544059753417969, + "learning_rate": 6.0501942599939666e-05, + "loss": 0.0593, + "step": 739 + }, + { + "epoch": 1.31, + "grad_norm": 0.4964008331298828, + "learning_rate": 6.0410781572456486e-05, + "loss": 0.0367, + "step": 740 + }, + { + "epoch": 1.31, + "grad_norm": 0.14776591956615448, + "learning_rate": 6.031958435542659e-05, + "loss": 0.0284, + "step": 741 + }, + { + "epoch": 1.31, + "grad_norm": 0.20590472221374512, + "learning_rate": 6.022835126586609e-05, + "loss": 0.0359, + "step": 742 + }, + { + "epoch": 1.31, + "grad_norm": 0.2493211179971695, + "learning_rate": 6.0137082620915863e-05, + "loss": 0.0424, + "step": 743 + }, + { + "epoch": 1.32, + "grad_norm": 0.31557443737983704, + "learning_rate": 6.0045778737840344e-05, + "loss": 0.0563, + "step": 744 + }, + { + "epoch": 1.32, + "grad_norm": 0.14257828891277313, + "learning_rate": 5.995443993402647e-05, + "loss": 0.024, + "step": 745 + }, + { + "epoch": 1.32, + "grad_norm": 0.21385452151298523, + "learning_rate": 5.9863066526982605e-05, + "loss": 0.0721, + "step": 746 + }, + { + "epoch": 1.32, + "grad_norm": 0.17539048194885254, + "learning_rate": 5.977165883433734e-05, + "loss": 0.025, + "step": 747 + }, + { + "epoch": 1.32, + "grad_norm": 0.28508231043815613, + "learning_rate": 5.9680217173838494e-05, + "loss": 0.0595, + "step": 748 + }, + { + "epoch": 1.33, + "grad_norm": 0.30929744243621826, + "learning_rate": 5.9588741863351924e-05, + "loss": 0.112, + "step": 749 + }, + { + "epoch": 1.33, + "grad_norm": 0.439656525850296, + "learning_rate": 5.949723322086053e-05, + "loss": 0.0427, + "step": 750 + }, + { + "epoch": 1.33, + "grad_norm": 0.2300054430961609, + "learning_rate": 5.940569156446298e-05, + "loss": 0.0437, + "step": 751 + }, + { + "epoch": 1.33, + "grad_norm": 0.4155109226703644, + "learning_rate": 5.931411721237279e-05, + "loss": 0.0569, + "step": 752 + }, + { + "epoch": 1.33, + "grad_norm": 0.25196224451065063, + "learning_rate": 5.922251048291707e-05, + "loss": 0.0413, + "step": 753 + }, + { + "epoch": 1.33, + "grad_norm": 0.5078486204147339, + "learning_rate": 5.913087169453554e-05, + "loss": 0.0988, + "step": 754 + }, + { + "epoch": 1.34, + "grad_norm": 0.26931652426719666, + "learning_rate": 5.9039201165779315e-05, + "loss": 0.0578, + "step": 755 + }, + { + "epoch": 1.34, + "grad_norm": 0.2641213834285736, + "learning_rate": 5.8947499215309834e-05, + "loss": 0.0362, + "step": 756 + }, + { + "epoch": 1.34, + "grad_norm": 0.23865339159965515, + "learning_rate": 5.8855766161897805e-05, + "loss": 0.0375, + "step": 757 + }, + { + "epoch": 1.34, + "grad_norm": 0.2594137191772461, + "learning_rate": 5.876400232442205e-05, + "loss": 0.0489, + "step": 758 + }, + { + "epoch": 1.34, + "grad_norm": 0.2721590995788574, + "learning_rate": 5.867220802186837e-05, + "loss": 0.0407, + "step": 759 + }, + { + "epoch": 1.34, + "grad_norm": 0.3681499660015106, + "learning_rate": 5.85803835733285e-05, + "loss": 0.0554, + "step": 760 + }, + { + "epoch": 1.35, + "grad_norm": 0.3132595121860504, + "learning_rate": 5.848852929799894e-05, + "loss": 0.0486, + "step": 761 + }, + { + "epoch": 1.35, + "grad_norm": 0.16972127556800842, + "learning_rate": 5.8396645515179884e-05, + "loss": 0.0473, + "step": 762 + }, + { + "epoch": 1.35, + "grad_norm": 0.30628886818885803, + "learning_rate": 5.83047325442741e-05, + "loss": 0.0664, + "step": 763 + }, + { + "epoch": 1.35, + "grad_norm": 0.3327179551124573, + "learning_rate": 5.8212790704785824e-05, + "loss": 0.0605, + "step": 764 + }, + { + "epoch": 1.35, + "grad_norm": 0.3301398754119873, + "learning_rate": 5.812082031631966e-05, + "loss": 0.0477, + "step": 765 + }, + { + "epoch": 1.36, + "grad_norm": 0.23960134387016296, + "learning_rate": 5.8028821698579385e-05, + "loss": 0.0376, + "step": 766 + }, + { + "epoch": 1.36, + "grad_norm": 0.2526357173919678, + "learning_rate": 5.7936795171367e-05, + "loss": 0.0712, + "step": 767 + }, + { + "epoch": 1.36, + "grad_norm": 0.32746273279190063, + "learning_rate": 5.784474105458143e-05, + "loss": 0.0542, + "step": 768 + }, + { + "epoch": 1.36, + "grad_norm": 0.10859230905771255, + "learning_rate": 5.77526596682176e-05, + "loss": 0.019, + "step": 769 + }, + { + "epoch": 1.36, + "grad_norm": 0.2908915877342224, + "learning_rate": 5.766055133236513e-05, + "loss": 0.142, + "step": 770 + }, + { + "epoch": 1.36, + "grad_norm": 0.26869770884513855, + "learning_rate": 5.7568416367207404e-05, + "loss": 0.0774, + "step": 771 + }, + { + "epoch": 1.37, + "grad_norm": 0.39681994915008545, + "learning_rate": 5.7476255093020326e-05, + "loss": 0.0632, + "step": 772 + }, + { + "epoch": 1.37, + "grad_norm": 0.14335761964321136, + "learning_rate": 5.7384067830171274e-05, + "loss": 0.03, + "step": 773 + }, + { + "epoch": 1.37, + "grad_norm": 0.1377771943807602, + "learning_rate": 5.729185489911797e-05, + "loss": 0.0263, + "step": 774 + }, + { + "epoch": 1.37, + "grad_norm": 0.19834232330322266, + "learning_rate": 5.719961662040733e-05, + "loss": 0.0506, + "step": 775 + }, + { + "epoch": 1.37, + "grad_norm": 0.14378659427165985, + "learning_rate": 5.710735331467444e-05, + "loss": 0.0285, + "step": 776 + }, + { + "epoch": 1.37, + "grad_norm": 0.25368401408195496, + "learning_rate": 5.701506530264132e-05, + "loss": 0.0584, + "step": 777 + }, + { + "epoch": 1.38, + "grad_norm": 0.12339203804731369, + "learning_rate": 5.692275290511592e-05, + "loss": 0.0282, + "step": 778 + }, + { + "epoch": 1.38, + "grad_norm": 0.203715518116951, + "learning_rate": 5.683041644299093e-05, + "loss": 0.0849, + "step": 779 + }, + { + "epoch": 1.38, + "grad_norm": 0.1526814103126526, + "learning_rate": 5.673805623724272e-05, + "loss": 0.0256, + "step": 780 + }, + { + "epoch": 1.38, + "grad_norm": 0.18840323388576508, + "learning_rate": 5.664567260893019e-05, + "loss": 0.048, + "step": 781 + }, + { + "epoch": 1.38, + "grad_norm": 0.15979206562042236, + "learning_rate": 5.6553265879193606e-05, + "loss": 0.0237, + "step": 782 + }, + { + "epoch": 1.39, + "grad_norm": 0.1128401905298233, + "learning_rate": 5.6460836369253624e-05, + "loss": 0.0213, + "step": 783 + }, + { + "epoch": 1.39, + "grad_norm": 0.1648949831724167, + "learning_rate": 5.6368384400410035e-05, + "loss": 0.0348, + "step": 784 + }, + { + "epoch": 1.39, + "grad_norm": 0.31091129779815674, + "learning_rate": 5.627591029404071e-05, + "loss": 0.0685, + "step": 785 + }, + { + "epoch": 1.39, + "grad_norm": 0.2921251654624939, + "learning_rate": 5.6183414371600496e-05, + "loss": 0.045, + "step": 786 + }, + { + "epoch": 1.39, + "grad_norm": 0.3398689925670624, + "learning_rate": 5.609089695462002e-05, + "loss": 0.0546, + "step": 787 + }, + { + "epoch": 1.39, + "grad_norm": 0.21610289812088013, + "learning_rate": 5.599835836470469e-05, + "loss": 0.0322, + "step": 788 + }, + { + "epoch": 1.4, + "grad_norm": 0.3218781054019928, + "learning_rate": 5.5905798923533484e-05, + "loss": 0.0331, + "step": 789 + }, + { + "epoch": 1.4, + "grad_norm": 0.5338783860206604, + "learning_rate": 5.581321895285787e-05, + "loss": 0.0764, + "step": 790 + }, + { + "epoch": 1.4, + "grad_norm": 0.2539553940296173, + "learning_rate": 5.5720618774500675e-05, + "loss": 0.0553, + "step": 791 + }, + { + "epoch": 1.4, + "grad_norm": 0.22199298441410065, + "learning_rate": 5.5627998710354957e-05, + "loss": 0.0304, + "step": 792 + }, + { + "epoch": 1.4, + "grad_norm": 0.13213643431663513, + "learning_rate": 5.5535359082382944e-05, + "loss": 0.0115, + "step": 793 + }, + { + "epoch": 1.4, + "grad_norm": 0.3686007857322693, + "learning_rate": 5.544270021261483e-05, + "loss": 0.0371, + "step": 794 + }, + { + "epoch": 1.41, + "grad_norm": 0.08815140277147293, + "learning_rate": 5.535002242314772e-05, + "loss": 0.0089, + "step": 795 + }, + { + "epoch": 1.41, + "grad_norm": 0.40321916341781616, + "learning_rate": 5.525732603614444e-05, + "loss": 0.0653, + "step": 796 + }, + { + "epoch": 1.41, + "grad_norm": 0.44097116589546204, + "learning_rate": 5.5164611373832544e-05, + "loss": 0.0555, + "step": 797 + }, + { + "epoch": 1.41, + "grad_norm": 0.5565125942230225, + "learning_rate": 5.5071878758503046e-05, + "loss": 0.0646, + "step": 798 + }, + { + "epoch": 1.41, + "grad_norm": 0.8579866290092468, + "learning_rate": 5.49791285125094e-05, + "loss": 0.1532, + "step": 799 + }, + { + "epoch": 1.42, + "grad_norm": 0.550639271736145, + "learning_rate": 5.488636095826636e-05, + "loss": 0.0574, + "step": 800 + }, + { + "epoch": 1.42, + "grad_norm": 0.07725897431373596, + "learning_rate": 5.479357641824877e-05, + "loss": 0.0087, + "step": 801 + }, + { + "epoch": 1.42, + "grad_norm": 0.25981655716896057, + "learning_rate": 5.470077521499063e-05, + "loss": 0.0328, + "step": 802 + }, + { + "epoch": 1.42, + "grad_norm": 0.07465404272079468, + "learning_rate": 5.4607957671083786e-05, + "loss": 0.0117, + "step": 803 + }, + { + "epoch": 1.42, + "grad_norm": 0.22613628208637238, + "learning_rate": 5.4515124109176904e-05, + "loss": 0.0596, + "step": 804 + }, + { + "epoch": 1.42, + "grad_norm": 0.20493067800998688, + "learning_rate": 5.442227485197435e-05, + "loss": 0.0394, + "step": 805 + }, + { + "epoch": 1.43, + "grad_norm": 0.2182394117116928, + "learning_rate": 5.4329410222235034e-05, + "loss": 0.0491, + "step": 806 + }, + { + "epoch": 1.43, + "grad_norm": 0.16270771622657776, + "learning_rate": 5.42365305427713e-05, + "loss": 0.0333, + "step": 807 + }, + { + "epoch": 1.43, + "grad_norm": 0.3527982234954834, + "learning_rate": 5.414363613644782e-05, + "loss": 0.1369, + "step": 808 + }, + { + "epoch": 1.43, + "grad_norm": 0.38832610845565796, + "learning_rate": 5.405072732618043e-05, + "loss": 0.0719, + "step": 809 + }, + { + "epoch": 1.43, + "grad_norm": 0.18581318855285645, + "learning_rate": 5.395780443493508e-05, + "loss": 0.0305, + "step": 810 + }, + { + "epoch": 1.43, + "grad_norm": 0.298115611076355, + "learning_rate": 5.386486778572665e-05, + "loss": 0.0676, + "step": 811 + }, + { + "epoch": 1.44, + "grad_norm": 0.15877433121204376, + "learning_rate": 5.3771917701617827e-05, + "loss": 0.0343, + "step": 812 + }, + { + "epoch": 1.44, + "grad_norm": 0.14393776655197144, + "learning_rate": 5.367895450571801e-05, + "loss": 0.0395, + "step": 813 + }, + { + "epoch": 1.44, + "grad_norm": 0.25177934765815735, + "learning_rate": 5.358597852118219e-05, + "loss": 0.0757, + "step": 814 + }, + { + "epoch": 1.44, + "grad_norm": 0.16125288605690002, + "learning_rate": 5.3492990071209806e-05, + "loss": 0.0432, + "step": 815 + }, + { + "epoch": 1.44, + "grad_norm": 0.21766537427902222, + "learning_rate": 5.3399989479043624e-05, + "loss": 0.087, + "step": 816 + }, + { + "epoch": 1.45, + "grad_norm": 0.18221743404865265, + "learning_rate": 5.3306977067968614e-05, + "loss": 0.0403, + "step": 817 + }, + { + "epoch": 1.45, + "grad_norm": 0.2517869472503662, + "learning_rate": 5.3213953161310825e-05, + "loss": 0.0666, + "step": 818 + }, + { + "epoch": 1.45, + "grad_norm": 0.20983122289180756, + "learning_rate": 5.3120918082436314e-05, + "loss": 0.0664, + "step": 819 + }, + { + "epoch": 1.45, + "grad_norm": 0.17914025485515594, + "learning_rate": 5.3027872154749915e-05, + "loss": 0.0382, + "step": 820 + }, + { + "epoch": 1.45, + "grad_norm": 0.12290598452091217, + "learning_rate": 5.2934815701694204e-05, + "loss": 0.0282, + "step": 821 + }, + { + "epoch": 1.45, + "grad_norm": 0.17699532210826874, + "learning_rate": 5.2841749046748345e-05, + "loss": 0.0413, + "step": 822 + }, + { + "epoch": 1.46, + "grad_norm": 0.3889511227607727, + "learning_rate": 5.274867251342694e-05, + "loss": 0.0758, + "step": 823 + }, + { + "epoch": 1.46, + "grad_norm": 0.24286973476409912, + "learning_rate": 5.2655586425278966e-05, + "loss": 0.0532, + "step": 824 + }, + { + "epoch": 1.46, + "grad_norm": 0.14911137521266937, + "learning_rate": 5.256249110588659e-05, + "loss": 0.0277, + "step": 825 + }, + { + "epoch": 1.46, + "grad_norm": 0.31271466612815857, + "learning_rate": 5.246938687886409e-05, + "loss": 0.0726, + "step": 826 + }, + { + "epoch": 1.46, + "grad_norm": 0.2684333920478821, + "learning_rate": 5.237627406785667e-05, + "loss": 0.0993, + "step": 827 + }, + { + "epoch": 1.46, + "grad_norm": 0.14797139167785645, + "learning_rate": 5.228315299653942e-05, + "loss": 0.0198, + "step": 828 + }, + { + "epoch": 1.47, + "grad_norm": 0.29548555612564087, + "learning_rate": 5.2190023988616113e-05, + "loss": 0.0562, + "step": 829 + }, + { + "epoch": 1.47, + "grad_norm": 0.29321712255477905, + "learning_rate": 5.2096887367818105e-05, + "loss": 0.1208, + "step": 830 + }, + { + "epoch": 1.47, + "grad_norm": 0.18331380188465118, + "learning_rate": 5.2003743457903256e-05, + "loss": 0.0256, + "step": 831 + }, + { + "epoch": 1.47, + "grad_norm": 0.21740898489952087, + "learning_rate": 5.1910592582654715e-05, + "loss": 0.057, + "step": 832 + }, + { + "epoch": 1.47, + "grad_norm": 0.2625051736831665, + "learning_rate": 5.181743506587989e-05, + "loss": 0.0667, + "step": 833 + }, + { + "epoch": 1.48, + "grad_norm": 0.2670525908470154, + "learning_rate": 5.172427123140923e-05, + "loss": 0.0883, + "step": 834 + }, + { + "epoch": 1.48, + "grad_norm": 0.33282265067100525, + "learning_rate": 5.1631101403095184e-05, + "loss": 0.0424, + "step": 835 + }, + { + "epoch": 1.48, + "grad_norm": 0.21608753502368927, + "learning_rate": 5.1537925904811004e-05, + "loss": 0.049, + "step": 836 + }, + { + "epoch": 1.48, + "grad_norm": 0.10450909286737442, + "learning_rate": 5.144474506044968e-05, + "loss": 0.0158, + "step": 837 + }, + { + "epoch": 1.48, + "grad_norm": 0.3188491761684418, + "learning_rate": 5.135155919392279e-05, + "loss": 0.0547, + "step": 838 + }, + { + "epoch": 1.48, + "grad_norm": 0.24398969113826752, + "learning_rate": 5.125836862915934e-05, + "loss": 0.053, + "step": 839 + }, + { + "epoch": 1.49, + "grad_norm": 0.1743936687707901, + "learning_rate": 5.116517369010466e-05, + "loss": 0.0239, + "step": 840 + }, + { + "epoch": 1.49, + "grad_norm": 0.180791437625885, + "learning_rate": 5.1071974700719326e-05, + "loss": 0.0864, + "step": 841 + }, + { + "epoch": 1.49, + "grad_norm": 0.19678902626037598, + "learning_rate": 5.0978771984978003e-05, + "loss": 0.0376, + "step": 842 + }, + { + "epoch": 1.49, + "grad_norm": 0.230797678232193, + "learning_rate": 5.0885565866868227e-05, + "loss": 0.0597, + "step": 843 + }, + { + "epoch": 1.49, + "grad_norm": 0.4890972971916199, + "learning_rate": 5.079235667038944e-05, + "loss": 0.0832, + "step": 844 + }, + { + "epoch": 1.49, + "grad_norm": 0.20508797466754913, + "learning_rate": 5.069914471955178e-05, + "loss": 0.0349, + "step": 845 + }, + { + "epoch": 1.5, + "grad_norm": 0.21593628823757172, + "learning_rate": 5.060593033837493e-05, + "loss": 0.0354, + "step": 846 + }, + { + "epoch": 1.5, + "grad_norm": 0.2712628245353699, + "learning_rate": 5.051271385088702e-05, + "loss": 0.0311, + "step": 847 + }, + { + "epoch": 1.5, + "grad_norm": 0.11844774335622787, + "learning_rate": 5.041949558112351e-05, + "loss": 0.0109, + "step": 848 + }, + { + "epoch": 1.5, + "grad_norm": 0.1798882633447647, + "learning_rate": 5.032627585312608e-05, + "loss": 0.0196, + "step": 849 + }, + { + "epoch": 1.5, + "grad_norm": 0.290019690990448, + "learning_rate": 5.023305499094144e-05, + "loss": 0.0667, + "step": 850 + }, + { + "epoch": 1.51, + "grad_norm": 0.24924272298812866, + "learning_rate": 5.013983331862027e-05, + "loss": 0.0556, + "step": 851 + }, + { + "epoch": 1.51, + "grad_norm": 0.22597135603427887, + "learning_rate": 5.004661116021605e-05, + "loss": 0.0495, + "step": 852 + }, + { + "epoch": 1.51, + "eval_loss": 0.07545028626918793, + "eval_runtime": 14.6561, + "eval_samples_per_second": 32.546, + "eval_steps_per_second": 8.188, + "step": 852 + }, + { + "epoch": 1.51, + "grad_norm": 0.15348747372627258, + "learning_rate": 4.9953388839783954e-05, + "loss": 0.0204, + "step": 853 + }, + { + "epoch": 1.51, + "grad_norm": 0.6507572531700134, + "learning_rate": 4.9860166681379745e-05, + "loss": 0.076, + "step": 854 + }, + { + "epoch": 1.51, + "grad_norm": 0.13102935254573822, + "learning_rate": 4.976694500905857e-05, + "loss": 0.0143, + "step": 855 + }, + { + "epoch": 1.51, + "grad_norm": 0.43004414439201355, + "learning_rate": 4.967372414687393e-05, + "loss": 0.0675, + "step": 856 + }, + { + "epoch": 1.52, + "grad_norm": 0.38339918851852417, + "learning_rate": 4.95805044188765e-05, + "loss": 0.0747, + "step": 857 + }, + { + "epoch": 1.52, + "grad_norm": 0.4646240472793579, + "learning_rate": 4.9487286149112986e-05, + "loss": 0.0883, + "step": 858 + }, + { + "epoch": 1.52, + "grad_norm": 0.2721651792526245, + "learning_rate": 4.9394069661625076e-05, + "loss": 0.062, + "step": 859 + }, + { + "epoch": 1.52, + "grad_norm": 0.356275349855423, + "learning_rate": 4.930085528044823e-05, + "loss": 0.0321, + "step": 860 + }, + { + "epoch": 1.52, + "grad_norm": 0.3066048324108124, + "learning_rate": 4.9207643329610556e-05, + "loss": 0.0525, + "step": 861 + }, + { + "epoch": 1.52, + "grad_norm": 0.12254035472869873, + "learning_rate": 4.911443413313179e-05, + "loss": 0.0106, + "step": 862 + }, + { + "epoch": 1.53, + "grad_norm": 0.19596797227859497, + "learning_rate": 4.9021228015022015e-05, + "loss": 0.0242, + "step": 863 + }, + { + "epoch": 1.53, + "grad_norm": 0.39066678285598755, + "learning_rate": 4.892802529928067e-05, + "loss": 0.0558, + "step": 864 + }, + { + "epoch": 1.53, + "grad_norm": 0.17401085793972015, + "learning_rate": 4.883482630989535e-05, + "loss": 0.0203, + "step": 865 + }, + { + "epoch": 1.53, + "grad_norm": 0.28903472423553467, + "learning_rate": 4.874163137084068e-05, + "loss": 0.0819, + "step": 866 + }, + { + "epoch": 1.53, + "grad_norm": 0.2746363580226898, + "learning_rate": 4.8648440806077226e-05, + "loss": 0.0696, + "step": 867 + }, + { + "epoch": 1.54, + "grad_norm": 0.40534642338752747, + "learning_rate": 4.8555254939550324e-05, + "loss": 0.111, + "step": 868 + }, + { + "epoch": 1.54, + "grad_norm": 0.41272208094596863, + "learning_rate": 4.8462074095188994e-05, + "loss": 0.1089, + "step": 869 + }, + { + "epoch": 1.54, + "grad_norm": 0.40718454122543335, + "learning_rate": 4.8368898596904834e-05, + "loss": 0.1339, + "step": 870 + }, + { + "epoch": 1.54, + "grad_norm": 0.14493143558502197, + "learning_rate": 4.827572876859078e-05, + "loss": 0.0227, + "step": 871 + }, + { + "epoch": 1.54, + "grad_norm": 0.20390640199184418, + "learning_rate": 4.8182564934120115e-05, + "loss": 0.0464, + "step": 872 + }, + { + "epoch": 1.54, + "grad_norm": 0.12927311658859253, + "learning_rate": 4.80894074173453e-05, + "loss": 0.0253, + "step": 873 + }, + { + "epoch": 1.55, + "grad_norm": 0.17653903365135193, + "learning_rate": 4.799625654209675e-05, + "loss": 0.0509, + "step": 874 + }, + { + "epoch": 1.55, + "grad_norm": 0.34687289595603943, + "learning_rate": 4.790311263218191e-05, + "loss": 0.0916, + "step": 875 + }, + { + "epoch": 1.55, + "grad_norm": 0.22851605713367462, + "learning_rate": 4.7809976011383905e-05, + "loss": 0.0857, + "step": 876 + }, + { + "epoch": 1.55, + "grad_norm": 0.27485382556915283, + "learning_rate": 4.771684700346059e-05, + "loss": 0.0623, + "step": 877 + }, + { + "epoch": 1.55, + "grad_norm": 0.21887461841106415, + "learning_rate": 4.762372593214335e-05, + "loss": 0.0573, + "step": 878 + }, + { + "epoch": 1.56, + "grad_norm": 0.2466115951538086, + "learning_rate": 4.753061312113592e-05, + "loss": 0.1039, + "step": 879 + }, + { + "epoch": 1.56, + "grad_norm": 0.344625860452652, + "learning_rate": 4.743750889411342e-05, + "loss": 0.0637, + "step": 880 + }, + { + "epoch": 1.56, + "grad_norm": 0.1676146686077118, + "learning_rate": 4.7344413574721046e-05, + "loss": 0.0372, + "step": 881 + }, + { + "epoch": 1.56, + "grad_norm": 0.14225785434246063, + "learning_rate": 4.725132748657307e-05, + "loss": 0.0506, + "step": 882 + }, + { + "epoch": 1.56, + "grad_norm": 0.19915729761123657, + "learning_rate": 4.715825095325168e-05, + "loss": 0.0459, + "step": 883 + }, + { + "epoch": 1.56, + "grad_norm": 0.20955249667167664, + "learning_rate": 4.70651842983058e-05, + "loss": 0.0539, + "step": 884 + }, + { + "epoch": 1.57, + "grad_norm": 0.171535924077034, + "learning_rate": 4.697212784525008e-05, + "loss": 0.0346, + "step": 885 + }, + { + "epoch": 1.57, + "grad_norm": 0.11981090158224106, + "learning_rate": 4.687908191756369e-05, + "loss": 0.0378, + "step": 886 + }, + { + "epoch": 1.57, + "grad_norm": 0.18210795521736145, + "learning_rate": 4.678604683868918e-05, + "loss": 0.0563, + "step": 887 + }, + { + "epoch": 1.57, + "grad_norm": 0.18884742259979248, + "learning_rate": 4.669302293203142e-05, + "loss": 0.0393, + "step": 888 + }, + { + "epoch": 1.57, + "grad_norm": 0.21338443458080292, + "learning_rate": 4.660001052095639e-05, + "loss": 0.054, + "step": 889 + }, + { + "epoch": 1.57, + "grad_norm": 0.16022799909114838, + "learning_rate": 4.65070099287902e-05, + "loss": 0.0497, + "step": 890 + }, + { + "epoch": 1.58, + "grad_norm": 0.30642077326774597, + "learning_rate": 4.641402147881782e-05, + "loss": 0.0702, + "step": 891 + }, + { + "epoch": 1.58, + "grad_norm": 0.24659690260887146, + "learning_rate": 4.6321045494282e-05, + "loss": 0.0986, + "step": 892 + }, + { + "epoch": 1.58, + "grad_norm": 0.4151371419429779, + "learning_rate": 4.62280822983822e-05, + "loss": 0.1064, + "step": 893 + }, + { + "epoch": 1.58, + "grad_norm": 0.19555744528770447, + "learning_rate": 4.613513221427337e-05, + "loss": 0.034, + "step": 894 + }, + { + "epoch": 1.58, + "grad_norm": 0.3575385510921478, + "learning_rate": 4.604219556506492e-05, + "loss": 0.0563, + "step": 895 + }, + { + "epoch": 1.59, + "grad_norm": 0.33982524275779724, + "learning_rate": 4.594927267381958e-05, + "loss": 0.1152, + "step": 896 + }, + { + "epoch": 1.59, + "grad_norm": 0.40054503083229065, + "learning_rate": 4.58563638635522e-05, + "loss": 0.0684, + "step": 897 + }, + { + "epoch": 1.59, + "grad_norm": 0.16741478443145752, + "learning_rate": 4.5763469457228695e-05, + "loss": 0.0221, + "step": 898 + }, + { + "epoch": 1.59, + "grad_norm": 0.30603042244911194, + "learning_rate": 4.5670589777764984e-05, + "loss": 0.0725, + "step": 899 + }, + { + "epoch": 1.59, + "grad_norm": 0.345217227935791, + "learning_rate": 4.5577725148025646e-05, + "loss": 0.062, + "step": 900 + }, + { + "epoch": 1.59, + "grad_norm": 0.4248473048210144, + "learning_rate": 4.54848758908231e-05, + "loss": 0.1482, + "step": 901 + }, + { + "epoch": 1.6, + "grad_norm": 0.25596097111701965, + "learning_rate": 4.5392042328916226e-05, + "loss": 0.0417, + "step": 902 + }, + { + "epoch": 1.6, + "grad_norm": 0.14021873474121094, + "learning_rate": 4.5299224785009374e-05, + "loss": 0.0242, + "step": 903 + }, + { + "epoch": 1.6, + "grad_norm": 0.16665437817573547, + "learning_rate": 4.5206423581751245e-05, + "loss": 0.0569, + "step": 904 + }, + { + "epoch": 1.6, + "grad_norm": 0.29362550377845764, + "learning_rate": 4.511363904173366e-05, + "loss": 0.068, + "step": 905 + }, + { + "epoch": 1.6, + "grad_norm": 0.44577184319496155, + "learning_rate": 4.5020871487490604e-05, + "loss": 0.0787, + "step": 906 + }, + { + "epoch": 1.6, + "grad_norm": 0.20594125986099243, + "learning_rate": 4.492812124149696e-05, + "loss": 0.0868, + "step": 907 + }, + { + "epoch": 1.61, + "grad_norm": 0.2906559109687805, + "learning_rate": 4.483538862616747e-05, + "loss": 0.0592, + "step": 908 + }, + { + "epoch": 1.61, + "grad_norm": 0.17545486986637115, + "learning_rate": 4.4742673963855576e-05, + "loss": 0.0225, + "step": 909 + }, + { + "epoch": 1.61, + "grad_norm": 0.18305286765098572, + "learning_rate": 4.46499775768523e-05, + "loss": 0.0483, + "step": 910 + }, + { + "epoch": 1.61, + "grad_norm": 0.2249644249677658, + "learning_rate": 4.455729978738517e-05, + "loss": 0.0383, + "step": 911 + }, + { + "epoch": 1.61, + "grad_norm": 0.3094448149204254, + "learning_rate": 4.446464091761706e-05, + "loss": 0.0533, + "step": 912 + }, + { + "epoch": 1.62, + "grad_norm": 0.22453933954238892, + "learning_rate": 4.437200128964504e-05, + "loss": 0.0435, + "step": 913 + }, + { + "epoch": 1.62, + "grad_norm": 0.1814616322517395, + "learning_rate": 4.4279381225499344e-05, + "loss": 0.0245, + "step": 914 + }, + { + "epoch": 1.62, + "grad_norm": 0.20599542558193207, + "learning_rate": 4.418678104714214e-05, + "loss": 0.0321, + "step": 915 + }, + { + "epoch": 1.62, + "grad_norm": 0.27197298407554626, + "learning_rate": 4.409420107646652e-05, + "loss": 0.0512, + "step": 916 + }, + { + "epoch": 1.62, + "grad_norm": 0.35009968280792236, + "learning_rate": 4.400164163529532e-05, + "loss": 0.0717, + "step": 917 + }, + { + "epoch": 1.62, + "grad_norm": 0.17196977138519287, + "learning_rate": 4.390910304537999e-05, + "loss": 0.033, + "step": 918 + }, + { + "epoch": 1.63, + "grad_norm": 0.1884760707616806, + "learning_rate": 4.381658562839953e-05, + "loss": 0.0526, + "step": 919 + }, + { + "epoch": 1.63, + "grad_norm": 0.4165942966938019, + "learning_rate": 4.3724089705959305e-05, + "loss": 0.0824, + "step": 920 + }, + { + "epoch": 1.63, + "grad_norm": 0.36213231086730957, + "learning_rate": 4.363161559958996e-05, + "loss": 0.0524, + "step": 921 + }, + { + "epoch": 1.63, + "grad_norm": 0.22675907611846924, + "learning_rate": 4.353916363074638e-05, + "loss": 0.0367, + "step": 922 + }, + { + "epoch": 1.63, + "grad_norm": 0.29561713337898254, + "learning_rate": 4.34467341208064e-05, + "loss": 0.0364, + "step": 923 + }, + { + "epoch": 1.63, + "grad_norm": 0.22054970264434814, + "learning_rate": 4.3354327391069826e-05, + "loss": 0.025, + "step": 924 + }, + { + "epoch": 1.64, + "grad_norm": 0.06693907827138901, + "learning_rate": 4.3261943762757287e-05, + "loss": 0.0104, + "step": 925 + }, + { + "epoch": 1.64, + "grad_norm": 0.339631050825119, + "learning_rate": 4.3169583557009064e-05, + "loss": 0.0732, + "step": 926 + }, + { + "epoch": 1.64, + "grad_norm": 0.2175193578004837, + "learning_rate": 4.307724709488409e-05, + "loss": 0.0464, + "step": 927 + }, + { + "epoch": 1.64, + "grad_norm": 0.23093104362487793, + "learning_rate": 4.298493469735869e-05, + "loss": 0.0335, + "step": 928 + }, + { + "epoch": 1.64, + "grad_norm": 0.38875579833984375, + "learning_rate": 4.289264668532557e-05, + "loss": 0.0327, + "step": 929 + }, + { + "epoch": 1.65, + "grad_norm": 0.05294647812843323, + "learning_rate": 4.280038337959268e-05, + "loss": 0.007, + "step": 930 + }, + { + "epoch": 1.65, + "grad_norm": 0.36184802651405334, + "learning_rate": 4.270814510088203e-05, + "loss": 0.0688, + "step": 931 + }, + { + "epoch": 1.65, + "grad_norm": 0.531517744064331, + "learning_rate": 4.2615932169828744e-05, + "loss": 0.1305, + "step": 932 + }, + { + "epoch": 1.65, + "grad_norm": 0.3471108376979828, + "learning_rate": 4.2523744906979686e-05, + "loss": 0.0236, + "step": 933 + }, + { + "epoch": 1.65, + "grad_norm": 0.2624709904193878, + "learning_rate": 4.24315836327926e-05, + "loss": 0.0272, + "step": 934 + }, + { + "epoch": 1.65, + "grad_norm": 0.3648707866668701, + "learning_rate": 4.233944866763489e-05, + "loss": 0.0384, + "step": 935 + }, + { + "epoch": 1.66, + "grad_norm": 0.2199166864156723, + "learning_rate": 4.224734033178241e-05, + "loss": 0.0347, + "step": 936 + }, + { + "epoch": 1.66, + "grad_norm": 0.44493308663368225, + "learning_rate": 4.2155258945418566e-05, + "loss": 0.0405, + "step": 937 + }, + { + "epoch": 1.66, + "grad_norm": 0.4102453291416168, + "learning_rate": 4.206320482863301e-05, + "loss": 0.0849, + "step": 938 + }, + { + "epoch": 1.66, + "grad_norm": 0.33510318398475647, + "learning_rate": 4.1971178301420613e-05, + "loss": 0.052, + "step": 939 + }, + { + "epoch": 1.66, + "grad_norm": 0.40965744853019714, + "learning_rate": 4.187917968368036e-05, + "loss": 0.0848, + "step": 940 + }, + { + "epoch": 1.66, + "grad_norm": 0.2755095064640045, + "learning_rate": 4.178720929521418e-05, + "loss": 0.0391, + "step": 941 + }, + { + "epoch": 1.67, + "grad_norm": 0.32818931341171265, + "learning_rate": 4.16952674557259e-05, + "loss": 0.0327, + "step": 942 + }, + { + "epoch": 1.67, + "grad_norm": 0.39538365602493286, + "learning_rate": 4.1603354484820134e-05, + "loss": 0.043, + "step": 943 + }, + { + "epoch": 1.67, + "grad_norm": 0.2221785932779312, + "learning_rate": 4.1511470702001074e-05, + "loss": 0.0288, + "step": 944 + }, + { + "epoch": 1.67, + "grad_norm": 0.2112448811531067, + "learning_rate": 4.141961642667152e-05, + "loss": 0.025, + "step": 945 + }, + { + "epoch": 1.67, + "grad_norm": 0.15548011660575867, + "learning_rate": 4.132779197813164e-05, + "loss": 0.0694, + "step": 946 + }, + { + "epoch": 1.68, + "grad_norm": 0.10790842771530151, + "learning_rate": 4.1235997675577956e-05, + "loss": 0.0124, + "step": 947 + }, + { + "epoch": 1.68, + "grad_norm": 0.42269936203956604, + "learning_rate": 4.11442338381022e-05, + "loss": 0.0937, + "step": 948 + }, + { + "epoch": 1.68, + "grad_norm": 0.36002832651138306, + "learning_rate": 4.105250078469018e-05, + "loss": 0.1242, + "step": 949 + }, + { + "epoch": 1.68, + "grad_norm": 0.22437407076358795, + "learning_rate": 4.0960798834220704e-05, + "loss": 0.0638, + "step": 950 + }, + { + "epoch": 1.68, + "grad_norm": 0.3967968225479126, + "learning_rate": 4.086912830546448e-05, + "loss": 0.0439, + "step": 951 + }, + { + "epoch": 1.68, + "grad_norm": 0.20550177991390228, + "learning_rate": 4.077748951708292e-05, + "loss": 0.0347, + "step": 952 + }, + { + "epoch": 1.69, + "grad_norm": 0.2020653337240219, + "learning_rate": 4.068588278762723e-05, + "loss": 0.0376, + "step": 953 + }, + { + "epoch": 1.69, + "grad_norm": 0.19614431262016296, + "learning_rate": 4.0594308435537024e-05, + "loss": 0.032, + "step": 954 + }, + { + "epoch": 1.69, + "grad_norm": 0.08721073716878891, + "learning_rate": 4.0502766779139484e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.69, + "grad_norm": 0.2303171306848526, + "learning_rate": 4.041125813664808e-05, + "loss": 0.0268, + "step": 956 + }, + { + "epoch": 1.69, + "grad_norm": 0.2725672721862793, + "learning_rate": 4.031978282616151e-05, + "loss": 0.0413, + "step": 957 + }, + { + "epoch": 1.69, + "grad_norm": 0.28184202313423157, + "learning_rate": 4.0228341165662685e-05, + "loss": 0.0383, + "step": 958 + }, + { + "epoch": 1.7, + "grad_norm": 0.28514358401298523, + "learning_rate": 4.0136933473017407e-05, + "loss": 0.044, + "step": 959 + }, + { + "epoch": 1.7, + "grad_norm": 0.374714732170105, + "learning_rate": 4.004556006597353e-05, + "loss": 0.044, + "step": 960 + }, + { + "epoch": 1.7, + "grad_norm": 0.19936969876289368, + "learning_rate": 3.9954221262159674e-05, + "loss": 0.0334, + "step": 961 + }, + { + "epoch": 1.7, + "grad_norm": 0.3212338984012604, + "learning_rate": 3.986291737908414e-05, + "loss": 0.0473, + "step": 962 + }, + { + "epoch": 1.7, + "grad_norm": 0.28045183420181274, + "learning_rate": 3.9771648734133906e-05, + "loss": 0.0321, + "step": 963 + }, + { + "epoch": 1.71, + "grad_norm": 0.34167036414146423, + "learning_rate": 3.968041564457342e-05, + "loss": 0.0696, + "step": 964 + }, + { + "epoch": 1.71, + "grad_norm": 0.5529135465621948, + "learning_rate": 3.958921842754351e-05, + "loss": 0.131, + "step": 965 + }, + { + "epoch": 1.71, + "grad_norm": 0.275803804397583, + "learning_rate": 3.949805740006036e-05, + "loss": 0.0436, + "step": 966 + }, + { + "epoch": 1.71, + "grad_norm": 0.3003288209438324, + "learning_rate": 3.94069328790143e-05, + "loss": 0.073, + "step": 967 + }, + { + "epoch": 1.71, + "grad_norm": 0.20078504085540771, + "learning_rate": 3.9315845181168784e-05, + "loss": 0.0425, + "step": 968 + }, + { + "epoch": 1.71, + "grad_norm": 0.3904169797897339, + "learning_rate": 3.9224794623159294e-05, + "loss": 0.0668, + "step": 969 + }, + { + "epoch": 1.72, + "grad_norm": 0.212997168302536, + "learning_rate": 3.913378152149214e-05, + "loss": 0.0436, + "step": 970 + }, + { + "epoch": 1.72, + "grad_norm": 0.04039880260825157, + "learning_rate": 3.904280619254348e-05, + "loss": 0.0077, + "step": 971 + }, + { + "epoch": 1.72, + "grad_norm": 0.21076536178588867, + "learning_rate": 3.895186895255814e-05, + "loss": 0.0677, + "step": 972 + }, + { + "epoch": 1.72, + "grad_norm": 0.37436169385910034, + "learning_rate": 3.886097011764856e-05, + "loss": 0.0294, + "step": 973 + }, + { + "epoch": 1.72, + "grad_norm": 0.26611942052841187, + "learning_rate": 3.877011000379367e-05, + "loss": 0.057, + "step": 974 + }, + { + "epoch": 1.72, + "grad_norm": 0.32198566198349, + "learning_rate": 3.8679288926837804e-05, + "loss": 0.0583, + "step": 975 + }, + { + "epoch": 1.73, + "grad_norm": 0.2785477340221405, + "learning_rate": 3.8588507202489586e-05, + "loss": 0.0913, + "step": 976 + }, + { + "epoch": 1.73, + "grad_norm": 0.20920749008655548, + "learning_rate": 3.8497765146320876e-05, + "loss": 0.0454, + "step": 977 + }, + { + "epoch": 1.73, + "grad_norm": 0.31738653779029846, + "learning_rate": 3.840706307376557e-05, + "loss": 0.0464, + "step": 978 + }, + { + "epoch": 1.73, + "grad_norm": 0.1887190192937851, + "learning_rate": 3.8316401300118675e-05, + "loss": 0.026, + "step": 979 + }, + { + "epoch": 1.73, + "grad_norm": 0.22016988694667816, + "learning_rate": 3.8225780140535025e-05, + "loss": 0.0375, + "step": 980 + }, + { + "epoch": 1.74, + "grad_norm": 0.2261650264263153, + "learning_rate": 3.813519991002831e-05, + "loss": 0.0368, + "step": 981 + }, + { + "epoch": 1.74, + "grad_norm": 0.3108493983745575, + "learning_rate": 3.804466092346997e-05, + "loss": 0.0539, + "step": 982 + }, + { + "epoch": 1.74, + "grad_norm": 0.23392857611179352, + "learning_rate": 3.7954163495587995e-05, + "loss": 0.0363, + "step": 983 + }, + { + "epoch": 1.74, + "grad_norm": 0.23699642717838287, + "learning_rate": 3.786370794096603e-05, + "loss": 0.0362, + "step": 984 + }, + { + "epoch": 1.74, + "grad_norm": 0.29672032594680786, + "learning_rate": 3.777329457404202e-05, + "loss": 0.0388, + "step": 985 + }, + { + "epoch": 1.74, + "grad_norm": 0.25258907675743103, + "learning_rate": 3.768292370910737e-05, + "loss": 0.0278, + "step": 986 + }, + { + "epoch": 1.75, + "grad_norm": 0.40179169178009033, + "learning_rate": 3.759259566030571e-05, + "loss": 0.1118, + "step": 987 + }, + { + "epoch": 1.75, + "grad_norm": 0.3777885437011719, + "learning_rate": 3.750231074163179e-05, + "loss": 0.073, + "step": 988 + }, + { + "epoch": 1.75, + "grad_norm": 0.39896661043167114, + "learning_rate": 3.7412069266930516e-05, + "loss": 0.0428, + "step": 989 + }, + { + "epoch": 1.75, + "grad_norm": 0.26577284932136536, + "learning_rate": 3.7321871549895714e-05, + "loss": 0.0335, + "step": 990 + }, + { + "epoch": 1.75, + "grad_norm": 0.5360684990882874, + "learning_rate": 3.7231717904069094e-05, + "loss": 0.0979, + "step": 991 + }, + { + "epoch": 1.75, + "grad_norm": 0.21000511944293976, + "learning_rate": 3.714160864283923e-05, + "loss": 0.0156, + "step": 992 + }, + { + "epoch": 1.76, + "grad_norm": 0.11425631493330002, + "learning_rate": 3.7051544079440336e-05, + "loss": 0.0143, + "step": 993 + }, + { + "epoch": 1.76, + "grad_norm": 0.20283763110637665, + "learning_rate": 3.696152452695128e-05, + "loss": 0.0627, + "step": 994 + }, + { + "epoch": 1.76, + "eval_loss": 0.07780980318784714, + "eval_runtime": 14.6775, + "eval_samples_per_second": 32.499, + "eval_steps_per_second": 8.176, + "step": 994 + }, + { + "epoch": 1.76, + "grad_norm": 0.13721764087677002, + "learning_rate": 3.68715502982945e-05, + "loss": 0.0154, + "step": 995 + }, + { + "epoch": 1.76, + "grad_norm": 0.397158145904541, + "learning_rate": 3.678162170623481e-05, + "loss": 0.0494, + "step": 996 + }, + { + "epoch": 1.76, + "grad_norm": 0.47730910778045654, + "learning_rate": 3.669173906337846e-05, + "loss": 0.0897, + "step": 997 + }, + { + "epoch": 1.77, + "grad_norm": 0.19200514256954193, + "learning_rate": 3.6601902682171894e-05, + "loss": 0.0145, + "step": 998 + }, + { + "epoch": 1.77, + "grad_norm": 0.39415818452835083, + "learning_rate": 3.65121128749008e-05, + "loss": 0.0778, + "step": 999 + }, + { + "epoch": 1.77, + "grad_norm": 0.17673304677009583, + "learning_rate": 3.642236995368897e-05, + "loss": 0.0211, + "step": 1000 + }, + { + "epoch": 1.77, + "grad_norm": 0.3190731108188629, + "learning_rate": 3.633267423049717e-05, + "loss": 0.0856, + "step": 1001 + }, + { + "epoch": 1.77, + "grad_norm": 0.3833164572715759, + "learning_rate": 3.624302601712213e-05, + "loss": 0.0687, + "step": 1002 + }, + { + "epoch": 1.77, + "grad_norm": 0.2938999831676483, + "learning_rate": 3.6153425625195425e-05, + "loss": 0.0717, + "step": 1003 + }, + { + "epoch": 1.78, + "grad_norm": 0.2645472586154938, + "learning_rate": 3.606387336618237e-05, + "loss": 0.0341, + "step": 1004 + }, + { + "epoch": 1.78, + "grad_norm": 0.1330021619796753, + "learning_rate": 3.597436955138102e-05, + "loss": 0.0244, + "step": 1005 + }, + { + "epoch": 1.78, + "grad_norm": 0.34308159351348877, + "learning_rate": 3.588491449192096e-05, + "loss": 0.091, + "step": 1006 + }, + { + "epoch": 1.78, + "grad_norm": 0.2861696481704712, + "learning_rate": 3.579550849876233e-05, + "loss": 0.0601, + "step": 1007 + }, + { + "epoch": 1.78, + "grad_norm": 0.32016104459762573, + "learning_rate": 3.570615188269473e-05, + "loss": 0.0699, + "step": 1008 + }, + { + "epoch": 1.79, + "grad_norm": 0.2865599989891052, + "learning_rate": 3.561684495433605e-05, + "loss": 0.0742, + "step": 1009 + }, + { + "epoch": 1.79, + "grad_norm": 0.2045123279094696, + "learning_rate": 3.5527588024131544e-05, + "loss": 0.0323, + "step": 1010 + }, + { + "epoch": 1.79, + "grad_norm": 0.13562075793743134, + "learning_rate": 3.5438381402352574e-05, + "loss": 0.0167, + "step": 1011 + }, + { + "epoch": 1.79, + "grad_norm": 0.2536921799182892, + "learning_rate": 3.534922539909569e-05, + "loss": 0.047, + "step": 1012 + }, + { + "epoch": 1.79, + "grad_norm": 0.193417489528656, + "learning_rate": 3.5260120324281474e-05, + "loss": 0.0349, + "step": 1013 + }, + { + "epoch": 1.79, + "grad_norm": 0.18863309919834137, + "learning_rate": 3.517106648765343e-05, + "loss": 0.0261, + "step": 1014 + }, + { + "epoch": 1.8, + "grad_norm": 0.5035936832427979, + "learning_rate": 3.5082064198777e-05, + "loss": 0.0963, + "step": 1015 + }, + { + "epoch": 1.8, + "grad_norm": 0.33512285351753235, + "learning_rate": 3.499311376703842e-05, + "loss": 0.0534, + "step": 1016 + }, + { + "epoch": 1.8, + "grad_norm": 0.14217574894428253, + "learning_rate": 3.4904215501643646e-05, + "loss": 0.0246, + "step": 1017 + }, + { + "epoch": 1.8, + "grad_norm": 0.3124421536922455, + "learning_rate": 3.4815369711617316e-05, + "loss": 0.0498, + "step": 1018 + }, + { + "epoch": 1.8, + "grad_norm": 0.22936655580997467, + "learning_rate": 3.4726576705801636e-05, + "loss": 0.0249, + "step": 1019 + }, + { + "epoch": 1.8, + "grad_norm": 0.5534436106681824, + "learning_rate": 3.463783679285535e-05, + "loss": 0.1696, + "step": 1020 + }, + { + "epoch": 1.81, + "grad_norm": 0.3127197027206421, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.045, + "step": 1021 + }, + { + "epoch": 1.81, + "grad_norm": 0.13819609582424164, + "learning_rate": 3.446051747928202e-05, + "loss": 0.0203, + "step": 1022 + }, + { + "epoch": 1.81, + "grad_norm": 0.35015806555747986, + "learning_rate": 3.4371938695045346e-05, + "loss": 0.0608, + "step": 1023 + }, + { + "epoch": 1.81, + "grad_norm": 0.3767643868923187, + "learning_rate": 3.428341423645668e-05, + "loss": 0.0686, + "step": 1024 + }, + { + "epoch": 1.81, + "grad_norm": 0.3282710611820221, + "learning_rate": 3.419494441124121e-05, + "loss": 0.0697, + "step": 1025 + }, + { + "epoch": 1.82, + "grad_norm": 0.24474768340587616, + "learning_rate": 3.4106529526934306e-05, + "loss": 0.0583, + "step": 1026 + }, + { + "epoch": 1.82, + "grad_norm": 0.24781620502471924, + "learning_rate": 3.4018169890880225e-05, + "loss": 0.0327, + "step": 1027 + }, + { + "epoch": 1.82, + "grad_norm": 0.2474932074546814, + "learning_rate": 3.392986581023126e-05, + "loss": 0.0679, + "step": 1028 + }, + { + "epoch": 1.82, + "grad_norm": 0.39474180340766907, + "learning_rate": 3.384161759194658e-05, + "loss": 0.0713, + "step": 1029 + }, + { + "epoch": 1.82, + "grad_norm": 0.13963012397289276, + "learning_rate": 3.375342554279111e-05, + "loss": 0.0179, + "step": 1030 + }, + { + "epoch": 1.82, + "grad_norm": 0.32144859433174133, + "learning_rate": 3.3665289969334585e-05, + "loss": 0.0447, + "step": 1031 + }, + { + "epoch": 1.83, + "grad_norm": 0.14767055213451385, + "learning_rate": 3.3577211177950385e-05, + "loss": 0.017, + "step": 1032 + }, + { + "epoch": 1.83, + "grad_norm": 0.3088414967060089, + "learning_rate": 3.348918947481452e-05, + "loss": 0.0483, + "step": 1033 + }, + { + "epoch": 1.83, + "grad_norm": 0.33189231157302856, + "learning_rate": 3.340122516590456e-05, + "loss": 0.0382, + "step": 1034 + }, + { + "epoch": 1.83, + "grad_norm": 0.4593893885612488, + "learning_rate": 3.3313318556998526e-05, + "loss": 0.0523, + "step": 1035 + }, + { + "epoch": 1.83, + "grad_norm": 0.253412127494812, + "learning_rate": 3.322546995367394e-05, + "loss": 0.0212, + "step": 1036 + }, + { + "epoch": 1.83, + "grad_norm": 0.42397648096084595, + "learning_rate": 3.3137679661306576e-05, + "loss": 0.087, + "step": 1037 + }, + { + "epoch": 1.84, + "grad_norm": 0.40718910098075867, + "learning_rate": 3.3049947985069616e-05, + "loss": 0.0965, + "step": 1038 + }, + { + "epoch": 1.84, + "grad_norm": 0.3604757487773895, + "learning_rate": 3.2962275229932446e-05, + "loss": 0.0973, + "step": 1039 + }, + { + "epoch": 1.84, + "grad_norm": 0.2129022628068924, + "learning_rate": 3.287466170065959e-05, + "loss": 0.0384, + "step": 1040 + }, + { + "epoch": 1.84, + "grad_norm": 0.20552971959114075, + "learning_rate": 3.2787107701809754e-05, + "loss": 0.0305, + "step": 1041 + }, + { + "epoch": 1.84, + "grad_norm": 0.41909754276275635, + "learning_rate": 3.269961353773469e-05, + "loss": 0.0725, + "step": 1042 + }, + { + "epoch": 1.85, + "grad_norm": 0.2766873240470886, + "learning_rate": 3.261217951257813e-05, + "loss": 0.0649, + "step": 1043 + }, + { + "epoch": 1.85, + "grad_norm": 0.1571783572435379, + "learning_rate": 3.252480593027478e-05, + "loss": 0.0238, + "step": 1044 + }, + { + "epoch": 1.85, + "grad_norm": 0.2809221148490906, + "learning_rate": 3.243749309454922e-05, + "loss": 0.0613, + "step": 1045 + }, + { + "epoch": 1.85, + "grad_norm": 0.4999224841594696, + "learning_rate": 3.235024130891487e-05, + "loss": 0.0806, + "step": 1046 + }, + { + "epoch": 1.85, + "grad_norm": 0.1895889937877655, + "learning_rate": 3.226305087667295e-05, + "loss": 0.026, + "step": 1047 + }, + { + "epoch": 1.85, + "grad_norm": 0.220509871840477, + "learning_rate": 3.217592210091137e-05, + "loss": 0.0681, + "step": 1048 + }, + { + "epoch": 1.86, + "grad_norm": 0.19226831197738647, + "learning_rate": 3.208885528450376e-05, + "loss": 0.0232, + "step": 1049 + }, + { + "epoch": 1.86, + "grad_norm": 0.30779534578323364, + "learning_rate": 3.200185073010831e-05, + "loss": 0.0547, + "step": 1050 + }, + { + "epoch": 1.86, + "grad_norm": 0.16252338886260986, + "learning_rate": 3.1914908740166795e-05, + "loss": 0.0237, + "step": 1051 + }, + { + "epoch": 1.86, + "grad_norm": 0.4130539000034332, + "learning_rate": 3.182802961690357e-05, + "loss": 0.0437, + "step": 1052 + }, + { + "epoch": 1.86, + "grad_norm": 0.20889542996883392, + "learning_rate": 3.1741213662324365e-05, + "loss": 0.0493, + "step": 1053 + }, + { + "epoch": 1.86, + "grad_norm": 0.27447709441185, + "learning_rate": 3.165446117821538e-05, + "loss": 0.0859, + "step": 1054 + }, + { + "epoch": 1.87, + "grad_norm": 0.07740370184183121, + "learning_rate": 3.1567772466142156e-05, + "loss": 0.011, + "step": 1055 + }, + { + "epoch": 1.87, + "grad_norm": 0.1462429016828537, + "learning_rate": 3.148114782744855e-05, + "loss": 0.0228, + "step": 1056 + }, + { + "epoch": 1.87, + "grad_norm": 0.34325939416885376, + "learning_rate": 3.139458756325576e-05, + "loss": 0.0928, + "step": 1057 + }, + { + "epoch": 1.87, + "grad_norm": 0.2712673246860504, + "learning_rate": 3.130809197446106e-05, + "loss": 0.0408, + "step": 1058 + }, + { + "epoch": 1.87, + "grad_norm": 0.24180567264556885, + "learning_rate": 3.122166136173706e-05, + "loss": 0.0309, + "step": 1059 + }, + { + "epoch": 1.88, + "grad_norm": 0.2045336663722992, + "learning_rate": 3.113529602553042e-05, + "loss": 0.0391, + "step": 1060 + }, + { + "epoch": 1.88, + "grad_norm": 0.09178590029478073, + "learning_rate": 3.104899626606088e-05, + "loss": 0.0132, + "step": 1061 + }, + { + "epoch": 1.88, + "grad_norm": 0.300592303276062, + "learning_rate": 3.0962762383320285e-05, + "loss": 0.0787, + "step": 1062 + }, + { + "epoch": 1.88, + "grad_norm": 0.31034108996391296, + "learning_rate": 3.08765946770714e-05, + "loss": 0.0307, + "step": 1063 + }, + { + "epoch": 1.88, + "grad_norm": 0.1603342443704605, + "learning_rate": 3.0790493446847024e-05, + "loss": 0.0209, + "step": 1064 + }, + { + "epoch": 1.88, + "grad_norm": 0.23990066349506378, + "learning_rate": 3.070445899194885e-05, + "loss": 0.0361, + "step": 1065 + }, + { + "epoch": 1.89, + "grad_norm": 0.15933218598365784, + "learning_rate": 3.061849161144641e-05, + "loss": 0.0137, + "step": 1066 + }, + { + "epoch": 1.89, + "grad_norm": 0.4810096025466919, + "learning_rate": 3.053259160417613e-05, + "loss": 0.1255, + "step": 1067 + }, + { + "epoch": 1.89, + "grad_norm": 0.4099353849887848, + "learning_rate": 3.0446759268740233e-05, + "loss": 0.0718, + "step": 1068 + }, + { + "epoch": 1.89, + "grad_norm": 0.39221569895744324, + "learning_rate": 3.0360994903505653e-05, + "loss": 0.0721, + "step": 1069 + }, + { + "epoch": 1.89, + "grad_norm": 0.48040878772735596, + "learning_rate": 3.02752988066031e-05, + "loss": 0.1077, + "step": 1070 + }, + { + "epoch": 1.89, + "grad_norm": 0.1548435539007187, + "learning_rate": 3.018967127592595e-05, + "loss": 0.0273, + "step": 1071 + }, + { + "epoch": 1.9, + "grad_norm": 0.37816908955574036, + "learning_rate": 3.010411260912922e-05, + "loss": 0.0421, + "step": 1072 + }, + { + "epoch": 1.9, + "grad_norm": 0.2902359366416931, + "learning_rate": 3.0018623103628596e-05, + "loss": 0.0645, + "step": 1073 + }, + { + "epoch": 1.9, + "grad_norm": 0.32010090351104736, + "learning_rate": 2.9933203056599275e-05, + "loss": 0.0444, + "step": 1074 + }, + { + "epoch": 1.9, + "grad_norm": 0.47379517555236816, + "learning_rate": 2.984785276497507e-05, + "loss": 0.0423, + "step": 1075 + }, + { + "epoch": 1.9, + "grad_norm": 0.3198046088218689, + "learning_rate": 2.9762572525447262e-05, + "loss": 0.0678, + "step": 1076 + }, + { + "epoch": 1.91, + "grad_norm": 0.33741793036460876, + "learning_rate": 2.9677362634463647e-05, + "loss": 0.0643, + "step": 1077 + }, + { + "epoch": 1.91, + "grad_norm": 0.2264060080051422, + "learning_rate": 2.9592223388227503e-05, + "loss": 0.0275, + "step": 1078 + }, + { + "epoch": 1.91, + "grad_norm": 0.3069595694541931, + "learning_rate": 2.9507155082696482e-05, + "loss": 0.0481, + "step": 1079 + }, + { + "epoch": 1.91, + "grad_norm": 0.26178085803985596, + "learning_rate": 2.9422158013581658e-05, + "loss": 0.0298, + "step": 1080 + }, + { + "epoch": 1.91, + "grad_norm": 0.4191998243331909, + "learning_rate": 2.93372324763465e-05, + "loss": 0.1156, + "step": 1081 + }, + { + "epoch": 1.91, + "grad_norm": 0.21308496594429016, + "learning_rate": 2.9252378766205758e-05, + "loss": 0.0478, + "step": 1082 + }, + { + "epoch": 1.92, + "grad_norm": 0.20033082365989685, + "learning_rate": 2.9167597178124585e-05, + "loss": 0.0262, + "step": 1083 + }, + { + "epoch": 1.92, + "grad_norm": 0.21688255667686462, + "learning_rate": 2.9082888006817365e-05, + "loss": 0.0767, + "step": 1084 + }, + { + "epoch": 1.92, + "grad_norm": 0.300791472196579, + "learning_rate": 2.899825154674674e-05, + "loss": 0.0464, + "step": 1085 + }, + { + "epoch": 1.92, + "grad_norm": 0.2416476011276245, + "learning_rate": 2.8913688092122664e-05, + "loss": 0.0605, + "step": 1086 + }, + { + "epoch": 1.92, + "grad_norm": 0.2521096169948578, + "learning_rate": 2.8829197936901232e-05, + "loss": 0.0293, + "step": 1087 + }, + { + "epoch": 1.92, + "grad_norm": 0.3229115605354309, + "learning_rate": 2.8744781374783813e-05, + "loss": 0.0435, + "step": 1088 + }, + { + "epoch": 1.93, + "grad_norm": 0.0780615508556366, + "learning_rate": 2.8660438699215898e-05, + "loss": 0.0109, + "step": 1089 + }, + { + "epoch": 1.93, + "grad_norm": 0.2879962623119354, + "learning_rate": 2.8576170203386143e-05, + "loss": 0.0665, + "step": 1090 + }, + { + "epoch": 1.93, + "grad_norm": 0.19985683262348175, + "learning_rate": 2.8491976180225388e-05, + "loss": 0.0378, + "step": 1091 + }, + { + "epoch": 1.93, + "grad_norm": 0.15360887348651886, + "learning_rate": 2.840785692240553e-05, + "loss": 0.0257, + "step": 1092 + }, + { + "epoch": 1.93, + "grad_norm": 0.19224387407302856, + "learning_rate": 2.832381272233864e-05, + "loss": 0.0529, + "step": 1093 + }, + { + "epoch": 1.94, + "grad_norm": 0.2037738561630249, + "learning_rate": 2.8239843872175814e-05, + "loss": 0.0228, + "step": 1094 + }, + { + "epoch": 1.94, + "grad_norm": 0.380874902009964, + "learning_rate": 2.8155950663806235e-05, + "loss": 0.0525, + "step": 1095 + }, + { + "epoch": 1.94, + "grad_norm": 0.28522253036499023, + "learning_rate": 2.8072133388856192e-05, + "loss": 0.0615, + "step": 1096 + }, + { + "epoch": 1.94, + "grad_norm": 0.1098146066069603, + "learning_rate": 2.7988392338687926e-05, + "loss": 0.0159, + "step": 1097 + }, + { + "epoch": 1.94, + "grad_norm": 0.2615334987640381, + "learning_rate": 2.7904727804398812e-05, + "loss": 0.0353, + "step": 1098 + }, + { + "epoch": 1.94, + "grad_norm": 0.2041955441236496, + "learning_rate": 2.7821140076820162e-05, + "loss": 0.0187, + "step": 1099 + }, + { + "epoch": 1.95, + "grad_norm": 0.1846192330121994, + "learning_rate": 2.773762944651632e-05, + "loss": 0.0554, + "step": 1100 + }, + { + "epoch": 1.95, + "grad_norm": 0.17711102962493896, + "learning_rate": 2.765419620378366e-05, + "loss": 0.0342, + "step": 1101 + }, + { + "epoch": 1.95, + "grad_norm": 0.3703756332397461, + "learning_rate": 2.7570840638649486e-05, + "loss": 0.0378, + "step": 1102 + }, + { + "epoch": 1.95, + "grad_norm": 0.4282096326351166, + "learning_rate": 2.7487563040871145e-05, + "loss": 0.0789, + "step": 1103 + }, + { + "epoch": 1.95, + "grad_norm": 0.32506605982780457, + "learning_rate": 2.740436369993491e-05, + "loss": 0.0337, + "step": 1104 + }, + { + "epoch": 1.95, + "grad_norm": 0.250688374042511, + "learning_rate": 2.7321242905055013e-05, + "loss": 0.0554, + "step": 1105 + }, + { + "epoch": 1.96, + "grad_norm": 0.3557257354259491, + "learning_rate": 2.7238200945172698e-05, + "loss": 0.0356, + "step": 1106 + }, + { + "epoch": 1.96, + "grad_norm": 0.3472774028778076, + "learning_rate": 2.715523810895515e-05, + "loss": 0.0348, + "step": 1107 + }, + { + "epoch": 1.96, + "grad_norm": 0.07373315095901489, + "learning_rate": 2.707235468479449e-05, + "loss": 0.0088, + "step": 1108 + }, + { + "epoch": 1.96, + "grad_norm": 0.21439437568187714, + "learning_rate": 2.6989550960806768e-05, + "loss": 0.0222, + "step": 1109 + }, + { + "epoch": 1.96, + "grad_norm": 0.2730681002140045, + "learning_rate": 2.690682722483102e-05, + "loss": 0.068, + "step": 1110 + }, + { + "epoch": 1.97, + "grad_norm": 0.41124334931373596, + "learning_rate": 2.6824183764428224e-05, + "loss": 0.086, + "step": 1111 + }, + { + "epoch": 1.97, + "grad_norm": 0.6637737154960632, + "learning_rate": 2.6741620866880335e-05, + "loss": 0.0365, + "step": 1112 + }, + { + "epoch": 1.97, + "grad_norm": 0.425441712141037, + "learning_rate": 2.665913881918921e-05, + "loss": 0.095, + "step": 1113 + }, + { + "epoch": 1.97, + "grad_norm": 0.5520187020301819, + "learning_rate": 2.6576737908075668e-05, + "loss": 0.0514, + "step": 1114 + }, + { + "epoch": 1.97, + "grad_norm": 0.5284621119499207, + "learning_rate": 2.6494418419978482e-05, + "loss": 0.0593, + "step": 1115 + }, + { + "epoch": 1.97, + "grad_norm": 0.08148845285177231, + "learning_rate": 2.641218064105341e-05, + "loss": 0.0084, + "step": 1116 + }, + { + "epoch": 1.98, + "grad_norm": 0.8642109036445618, + "learning_rate": 2.6330024857172192e-05, + "loss": 0.0766, + "step": 1117 + }, + { + "epoch": 1.98, + "grad_norm": 0.40509146451950073, + "learning_rate": 2.6247951353921485e-05, + "loss": 0.1148, + "step": 1118 + }, + { + "epoch": 1.98, + "grad_norm": 0.2981242537498474, + "learning_rate": 2.616596041660194e-05, + "loss": 0.0666, + "step": 1119 + }, + { + "epoch": 1.98, + "grad_norm": 0.21514151990413666, + "learning_rate": 2.6084052330227238e-05, + "loss": 0.0363, + "step": 1120 + }, + { + "epoch": 1.98, + "grad_norm": 0.10281267762184143, + "learning_rate": 2.6002227379522992e-05, + "loss": 0.0169, + "step": 1121 + }, + { + "epoch": 1.98, + "grad_norm": 0.3236760199069977, + "learning_rate": 2.5920485848925913e-05, + "loss": 0.0296, + "step": 1122 + }, + { + "epoch": 1.99, + "grad_norm": 0.22741632163524628, + "learning_rate": 2.5838828022582594e-05, + "loss": 0.023, + "step": 1123 + }, + { + "epoch": 1.99, + "grad_norm": 0.3826078772544861, + "learning_rate": 2.5757254184348778e-05, + "loss": 0.0744, + "step": 1124 + }, + { + "epoch": 1.99, + "grad_norm": 0.226307213306427, + "learning_rate": 2.5675764617788234e-05, + "loss": 0.0297, + "step": 1125 + }, + { + "epoch": 1.99, + "grad_norm": 0.31913021206855774, + "learning_rate": 2.5594359606171724e-05, + "loss": 0.0793, + "step": 1126 + }, + { + "epoch": 1.99, + "grad_norm": 0.2947479486465454, + "learning_rate": 2.5513039432476193e-05, + "loss": 0.1363, + "step": 1127 + }, + { + "epoch": 2.0, + "grad_norm": 0.26046791672706604, + "learning_rate": 2.5431804379383523e-05, + "loss": 0.0727, + "step": 1128 + }, + { + "epoch": 2.0, + "grad_norm": 0.1183793917298317, + "learning_rate": 2.535065472927983e-05, + "loss": 0.0139, + "step": 1129 + }, + { + "epoch": 2.0, + "grad_norm": 0.23370495438575745, + "learning_rate": 2.526959076425434e-05, + "loss": 0.0503, + "step": 1130 + } + ], + "logging_steps": 1, + "max_steps": 1695, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 565, + "total_flos": 1.0339891388035891e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}